├── .gitattributes ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── MarcoPolo ├── __init__.py ├── regression │ ├── __init__.py │ ├── datasets.py │ ├── models.py │ └── trainer.py ├── summarization │ ├── __init__.py │ └── summarizer.py ├── utils.py └── visualization │ ├── __init__.py │ ├── reporter.py │ └── template │ ├── assets │ ├── details_close.png │ ├── details_open.png │ ├── mp.png │ ├── mp_white.png │ ├── mp_white_large_font.png │ ├── scripts.js │ └── styles.css │ └── index.html ├── README.md ├── assets ├── mp.png └── overview.png ├── notebooks ├── MarcoPolo.ipynb └── example │ ├── HumanLiver.h5ad │ └── hESC.h5ad └── setup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | *.ipynb filter=nbstripout 3 | *.ipynb diff=ipynb 4 | *.ipynb linguist-language=Python 5 | *.css linguist-detectable=false 6 | *.js linguist-detectable=false 7 | *.html linguist-detectable=false 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | results/* 2 | wandb/* 3 | 4 | ### Python ### 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | ### PyCharm ### 11 | .idea 12 | 13 | ### Jupyter Notebook ### 14 | .ipynb_checkpoints 15 | 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MarcoPolo is free for academic and non-commercial use. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.md 2 | include *.yml 3 | 4 | include LICENSE 5 | 6 | recursive-include MarcoPolo/visualization * 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /MarcoPolo/__init__.py: -------------------------------------------------------------------------------- 1 | from MarcoPolo.regression.trainer import run_regression 2 | from MarcoPolo.summarization.summarizer import find_markers 3 | from MarcoPolo.visualization.reporter import generate_report -------------------------------------------------------------------------------- /MarcoPolo/regression/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chanwkimlab/MarcoPolo/3c0f3300a2e1c264a1697e7e8325f29db597b516/MarcoPolo/regression/__init__.py -------------------------------------------------------------------------------- /MarcoPolo/regression/datasets.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | 3 | class CellDataset(Dataset): 4 | def __init__(self, y, x, s): 5 | self.y = y 6 | self.x = x 7 | self.s = s 8 | 9 | def __len__(self): 10 | return self.y.shape[0] 11 | 12 | def __getitem__(self, idx): 13 | item = {"Y": self.y[idx, :], "X": self.x[idx, :], "s": self.s[idx]} 14 | return item 15 | 16 | if __name__ == '__main__': 17 | from torch.utils.data import DataLoader 18 | from scipy.io import mmread 19 | import numpy as np 20 | import pandas as pd 21 | 22 | Y_=mmread('../datasets/koh_extract/koh.data.counts.mm').toarray().astype(float).transpose() 23 | s_=pd.read_csv('../datasets/analysis/koh.size_factor_cluster.tsv',sep='\t',header=None)[0].values.astype(float)#.reshape(-1,1) 24 | 
X_=np.array([np.ones(Y_.shape[0])]).transpose() 25 | 26 | cell_dataset = CellDataset(Y_, X_, s_) 27 | cell_dataloader = DataLoader(dataset=cell_dataset, shuffle=False, batch_size=Y_.shape[0], num_workers=0) 28 | -------------------------------------------------------------------------------- /MarcoPolo/regression/models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | torch.set_default_dtype(torch.float64) 7 | 8 | class Masked_Function(torch.autograd.Function): 9 | @staticmethod 10 | def forward(ctx, input, mask): 11 | output=input 12 | ctx.save_for_backward(input, mask) 13 | return output 14 | 15 | @staticmethod 16 | def backward(ctx, grad_output): 17 | input, mask = ctx.saved_tensors 18 | grad_input = grad_mask = None 19 | if ctx.needs_input_grad[0]: 20 | grad_input = grad_output.mul(mask) 21 | 22 | return grad_input, grad_mask 23 | 24 | class Masked(nn.Module): 25 | def __init__(self, mask): 26 | super(Masked, self).__init__() 27 | 28 | self.mask = nn.Parameter(torch.Tensor(mask)==1, requires_grad=False) 29 | 30 | def forward(self, input): 31 | return Masked_Function.apply(input, self.mask) 32 | 33 | def extra_repr(self): 34 | return 'mask={}'.format(self.mask.shape) 35 | 36 | 37 | class Poisson_logprob(nn.Module): 38 | def __init__(self): 39 | super(Poisson_logprob,self).__init__() 40 | 41 | def forward(self,rate,value): 42 | return (rate.log() * value) - rate - (value + 1).lgamma() 43 | 44 | poisson_logprob=Poisson_logprob() 45 | 46 | 47 | class Poisson_Function(torch.autograd.Function): 48 | @staticmethod 49 | def forward(ctx, Y, X, s, delta_log, beta, mask, to_return='LL'): 50 | 51 | with torch.no_grad(): 52 | mu=torch.exp((X.matmul(beta)+torch.log(s.view(-1, 1))).unsqueeze(dim=1).repeat(1,delta_log.shape[0],1)+torch.exp(delta_log)*mask) 53 | Y_extend=Y.unsqueeze(dim=1).repeat(1,mu.shape[1],1) 54 | Y_logprob=poisson_logprob(rate=mu,value=Y_extend) # (N,C,G) 55 | Y_logprob_reduce=Y_logprob.sum(axis=2) # (N,C) 56 | 57 | Y_logprob_reduce_reduce=torch.logsumexp(Y_logprob_reduce,dim=1).view(-1,1) # (N,1) 58 | 59 | LL=torch.sum(Y_logprob_reduce_reduce) # (1) 60 | 61 | gamma=torch.exp(Y_logprob_reduce-Y_logprob_reduce_reduce) 62 | A=mu-Y.unsqueeze(dim=1) 63 | 64 | #gradient 65 | grad_delta_log=(A*gamma.unsqueeze(dim=2)).sum(axis=0) 66 | grad_beta=(X.unsqueeze(dim=2)@gamma.unsqueeze(dim=1)@A).sum(axis=0) 67 | 68 | ctx.save_for_backward(grad_delta_log,grad_beta) 69 | 70 | if to_return=='LL': 71 | return LL 72 | elif to_return=='gamma': 73 | return gamma 74 | else: 75 | raise 76 | 77 | @staticmethod 78 | def backward(ctx, grad_output): 79 | 80 | grad_Y = grad_X = grad_s = grad_delta_log = grad_beta = grad_mask=None 81 | grad_delta_log,grad_beta = ctx.saved_tensors 82 | 83 | return grad_Y, grad_X, grad_s, grad_delta_log, grad_beta, grad_mask 84 | 85 | 86 | class MarcoPoloModel(nn.Module): 87 | def __init__(self, Y, rho, X_col=5, delta_min=2): 88 | # Y,rho are needed for model parameter initialization 89 | super(MarcoPoloModel, self).__init__() 90 | 91 | # rho 92 | self.masked = Masked(rho) 93 | self.init_paramter_rho(rho) 94 | # delta 95 | with np.errstate(divide='ignore'): 96 | self.delta_log_min = np.log(delta_min) 97 | self.delta_log = nn.Parameter(torch.Tensor(np.ones(rho.shape)), requires_grad=True) # (C,G) 98 | self.init_parameter_delta_min(delta_min) 99 | #beta 100 | self.beta=nn.Parameter(torch.Tensor(np.ones((X_col,Y.shape[1]))),requires_grad=True) # (P,G) 101 | 
self.init_paramter_Y(Y) 102 | 103 | def init_paramter_rho(self,rho): 104 | self.masked.mask.data=torch.Tensor((rho==1)).to(self.masked.mask.device) 105 | 106 | def init_parameter_delta_min(self,delta_min): 107 | with np.errstate(divide='ignore'): 108 | self.delta_log_min=np.log(delta_min) # 109 | if delta_min==0: 110 | self.delta_log.data=torch.Tensor(np.random.uniform(np.log(2)-0.1,np.log(2)+0.1,size=self.delta_log.shape)).to(self.delta_log.device) # (C,G) 111 | else: 112 | self.delta_log.data=torch.Tensor(np.random.uniform(self.delta_log_min-0.1,self.delta_log_min+0.1,size=self.delta_log.shape)).to(self.delta_log.device) # (C,G) 113 | self.delta_log.data=self.delta_log.data.clamp(min=self.delta_log_min) 114 | 115 | def init_paramter_Y(self,Y): 116 | Y_colmean=np.mean(Y,axis=0) 117 | beta_init=np.hstack([((Y_colmean-Y_colmean.mean())/(np.std(Y_colmean) if len(Y_colmean)>1 else 1)).reshape(-1,1), np.zeros((Y.shape[1],self.beta.shape[0]-1))]).T 118 | self.beta.data[:]=torch.Tensor(beta_init).to(self.beta.device) 119 | 120 | def forward(self, Y, X, s, to_return='LL'): 121 | """ 122 | Get Y, X, and s and return LL or gamma 123 | 124 | Args: 125 | Y (torch.Tensor): Tensor of shape (number of cells, number of genes) 126 | X (torch.Tensor): Tensor of shape (number of cells, number of covariates) 127 | s (torch.Tensor): Tensor of shape (num. of cells, ) 128 | to_return: 'LL' or 'gamma' 129 | 130 | Returns: 131 | torch.Tensor: Log-likelihood of the data 132 | """ 133 | if to_return=='LL': 134 | delta_log_masked=self.masked(self.delta_log) #(C,G) 135 | LL=Poisson_Function.apply(Y, X, s, delta_log_masked, self.beta, self.masked.mask) 136 | return LL 137 | elif to_return=='gamma': 138 | with torch.no_grad(): 139 | gamma=Poisson_Function.apply(Y, X, s, self.delta_log, self.beta, self.masked.mask, 'gamma') 140 | return gamma 141 | else: 142 | raise ValueError('to_return must be either "LL" or "gamma"') 143 | 144 | if __name__ == '__main__': 145 | model = MarcoPoloModel(Y=np.ones((5, 5)), rho=np.ones((5, 5))) 146 | a = model(Y=torch.Tensor(np.ones((5, 5))), X=torch.Tensor(np.ones((5, 5))), s=torch.Tensor(np.ones((5, 1)))) 147 | a.backward() 148 | 149 | -------------------------------------------------------------------------------- /MarcoPolo/regression/trainer.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import multiprocessing 3 | 4 | import anndata as ad 5 | import numpy as np 6 | import pandas as pd 7 | from typing import Union, List, Tuple 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.optim as optim 12 | from torch.utils.data import DataLoader 13 | from tqdm import tqdm 14 | # from tqdm import tqdm_notebook as tqdm 15 | 16 | from MarcoPolo.regression.models import MarcoPoloModel 17 | from MarcoPolo.regression.datasets import CellDataset 18 | 19 | torch.set_default_dtype(torch.float64) 20 | 21 | 22 | def fit_one_gene(model: nn.Module, optimizer: optim.Adamax, cell_dataloader: DataLoader, device: str, 23 | EM_ITER_MAX: float, M_ITER_MAX: float, LL_diff_tolerance: float, Q_diff_tolerance: float, 24 | verbose: bool = True): 25 | """ 26 | Run EM trick algorithm. 27 | Args: 28 | model: Model to be trained 29 | optimizer: Optimizer to be used 30 | cell_dataloader: DataLoader for training 31 | device: device to use. If you want to use GPU set to 'cuda:0'. 
If you want to use CPU set to 'cpu' 32 | EM_ITER_MAX: maximum number of iterations of E-step of the EM trick algorithm 33 | M_ITER_MAX: maximum number of iterations of M-step of the EM trick algorithm 34 | LL_diff_tolerance: tolerance for the difference of log likelihood between two iterations of the EM trick algorithm 35 | Q_diff_tolerance: tolerance for the difference of Q between two iterations of the EM trick algorithm 36 | verbose: 37 | 38 | Returns: 39 | gamma_new: gamma after EM trick algorithm 40 | LL_new: log likelihood after EM trick algorithm 41 | Q_new: Q after EM trick algorithm 42 | em_idx_max: number of iterations of the EM trick algorithm 43 | m_idx_max: number of iterations of the M-step of the EM trick algorithm 44 | """ 45 | if verbose: 46 | print('Start time:', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')) 47 | 48 | with torch.no_grad(): 49 | for batch_idx, batch in enumerate(cell_dataloader): 50 | batch_Y = batch['Y'].to(device) 51 | batch_X = batch['X'].to(device) 52 | batch_s = batch['s'].to(device) 53 | LL_old = model(batch_Y, batch_X, batch_s) 54 | Q_old = LL_old 55 | if verbose: 56 | print(LL_old) 57 | 58 | em_idx_max = 0 59 | m_idx_max = 0 60 | 61 | for em_idx in range(EM_ITER_MAX): # 62 | LL_new = torch.zeros_like(LL_old) 63 | for batch_idx, batch in enumerate(cell_dataloader): 64 | # Usually batch size is the size of samples and only one batch is used. But if the dataset is too large, it is better to use multiple batches. 65 | batch_Y = batch['Y'].to(device) 66 | batch_X = batch['X'].to(device) 67 | batch_s = batch['s'].to(device) 68 | 69 | ############# 70 | # M-step 71 | ############# 72 | for m_idx in range(M_ITER_MAX): 73 | optimizer.zero_grad() 74 | Q_new = -model(batch_Y, batch_X, batch_s) 75 | Q_new.backward() 76 | optimizer.step() 77 | 78 | # Constraint 79 | model.delta_log.data = model.delta_log.data.clamp(min=model.delta_log_min) 80 | # model.NB_basis_a.data=model.NB_basis_a.data.clamp(min=0) 81 | 82 | if m_idx % 20 == 0: 83 | Q_diff = (Q_old - Q_new) / torch.abs(Q_old) 84 | Q_old = Q_new 85 | if verbose: 86 | print('M: {}, Q: {} Q_diff: {}'.format(m_idx, Q_new, Q_diff)) 87 | if m_idx > 0 and torch.abs(Q_diff) < (Q_diff_tolerance): 88 | if verbose: 89 | print('M break') 90 | break 91 | m_idx_max = max(m_idx_max, m_idx) 92 | 93 | ############# 94 | # Look at LL 95 | ############# 96 | with torch.no_grad(): 97 | LL_temp = -Q_new 98 | LL_new += LL_temp 99 | 100 | LL_diff = (LL_new - LL_old) / torch.abs(LL_old) 101 | LL_old = LL_new 102 | 103 | if verbose: 104 | print('EM: {}, LL: {} LL_diff: {}'.format(em_idx, LL_new, LL_diff)) 105 | if LL_diff < LL_diff_tolerance: 106 | if verbose: 107 | print('EM break') 108 | break 109 | em_idx_max = max(em_idx_max, em_idx) 110 | 111 | if verbose: 112 | print('End time:', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')) 113 | 114 | with torch.no_grad(): 115 | gamma_new = model(batch_Y, batch_X, batch_s, to_return='gamma') 116 | 117 | return gamma_new, Q_new, LL_new, em_idx_max, m_idx_max 118 | 119 | 120 | def fit_multiple_genes(Y: np.array, X: np.array, s: np.array, 121 | num_cluster_list: List, learning_rate: float, fit_one_gene_parameters: dict, device: str, 122 | start_gene_idx: Union[None, int] = None, end_gene_idx: Union[None, int] = None, 123 | verbose: bool = False)-> dict: 124 | """ 125 | Fit multiple genes. 
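    This is the worker routine behind `run_regression`; it is normally not called directly. A minimal direct-call sketch with toy all-ones inputs (shapes and hyperparameter values are illustrative only; they mirror what `run_regression` passes by default):

        Y = np.ones((100, 3)); X = np.ones((100, 1)); s = np.ones(100)
        fit_parameters = {"EM_ITER_MAX": 20, "M_ITER_MAX": 10000,
                          "LL_diff_tolerance": 1e-4, "Q_diff_tolerance": 1e-4}
        result = fit_multiple_genes(Y=Y, X=X, s=s, num_cluster_list=[1, 2],
                                    learning_rate=0.1,
                                    fit_one_gene_parameters=fit_parameters,
                                    device="cpu")
        # result keys: gamma_list_cluster, delta_log_cluster, beta_cluster, result_cluster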
126 | 127 | Args: 128 | Y (np.array): matrix of gene expression (cell, gene) 129 | X (np.array): matrix of cell covariates (cell, feature) 130 | s (np.array): vector of cell sizes (cell, ) 131 | num_cluster_list (list): list of number of clusters to test 132 | learning_rate (float): learning rate of the optimizer 133 | fit_one_gene_parameters (dict): parameters for fit_one_gene 134 | start_gene_idx (Union[None, int]): start gene index. required for multithreading. If None, start from the first gene. 135 | end_gene_idx (Union[None, int]): end gene index. required for multithreading. If None, end at the last gene. 136 | device (str): device to use. If you want to use GPU set to 'cuda:0'. If you want to use CPU set to 'cpu' 137 | verbose (bool): if True, print out the progress 138 | 139 | Returns: 140 | regression_result: a dictionary containing the regression results with the following fields: `gamma_list_cluster`, `delta_log_cluster`, `beta_cluster`, `result_cluster`. 141 | 142 | """ 143 | 144 | if start_gene_idx is not None or end_gene_idx is not None: 145 | Y_select = Y[:, start_gene_idx:end_gene_idx] 146 | else: 147 | Y_select=Y 148 | 149 | device = torch.device(device) 150 | 151 | gamma_cluster = {} 152 | 153 | Q_cluster = {} 154 | LL_cluster = {} 155 | em_idx_max_cluster = {} 156 | m_idx_max_cluster = {} 157 | 158 | delta_log_cluster = {} 159 | beta_cluster = {} 160 | 161 | for idx, num_cluster in enumerate(num_cluster_list): 162 | gamma_list = [] 163 | 164 | Q_list = [] 165 | LL_list = [] 166 | em_idx_max_list = [] 167 | m_idx_max_list = [] 168 | 169 | delta_log_list = [] 170 | beta_list = [] 171 | 172 | if len(multiprocessing.current_process()._identity) == 0 or multiprocessing.current_process()._identity[0] == 1: 173 | print(f'({idx+1}) Fitting with {num_cluster} cluster(s)') 174 | pbar = tqdm(np.arange(Y_select.shape[1]), desc='Progress') 175 | else: 176 | pbar = np.arange(Y_select.shape[1]) 177 | 178 | for iter_idx, exp_data_idx in enumerate(pbar): 179 | cell_dataset = CellDataset(Y_select[:, iter_idx:iter_idx + 1], X, s) 180 | 181 | cell_dataloader = DataLoader(dataset=cell_dataset, shuffle=False, batch_size=Y_select.shape[0], 182 | num_workers=0) 183 | 184 | if iter_idx == 0: 185 | model = MarcoPoloModel(Y=Y_select[:, iter_idx:iter_idx + 1], rho=np.ones((num_cluster, 1)), 186 | X_col=X.shape[1], 187 | delta_min=0).to(device) 188 | else: 189 | model.init_parameter_delta_min(0) 190 | model.init_paramter_Y(Y_select[:, iter_idx:iter_idx + 1]) 191 | optimizer = optim.Adamax(model.parameters(), lr=learning_rate) # ,betas=(0.92, 0.999)) 192 | gamma, Q, LL, em_idx_max, m_idx_max = fit_one_gene(model=model, optimizer=optimizer, 193 | cell_dataloader=cell_dataloader, device=device, 194 | **fit_one_gene_parameters, verbose=verbose) 195 | 196 | gamma_list.append(gamma.cpu().numpy()) 197 | 198 | Q_list.append(Q.detach().cpu().numpy()) 199 | LL_list.append(LL.detach().cpu().numpy()) 200 | em_idx_max_list.append(em_idx_max) 201 | m_idx_max_list.append(m_idx_max) 202 | 203 | delta_log_list.append(model.delta_log.detach().cpu().numpy()) 204 | beta_list.append(model.beta.detach().cpu().numpy()) 205 | 206 | gamma_cluster[num_cluster] = gamma_list 207 | 208 | delta_log_cluster[num_cluster] = delta_log_list 209 | 210 | beta_cluster[num_cluster] = beta_list 211 | 212 | Q_cluster[num_cluster] = Q_list 213 | LL_cluster[num_cluster] = LL_list 214 | em_idx_max_cluster[num_cluster] = em_idx_max_list 215 | m_idx_max_cluster[num_cluster] = m_idx_max_list 216 | 217 | result_cluster = {num_cluster: 
pd.DataFrame([Q_cluster[num_cluster], 218 | LL_cluster[num_cluster], 219 | em_idx_max_cluster[num_cluster], 220 | m_idx_max_cluster[num_cluster]], 221 | index=['Q', 'LL', 'em_idx_max', 'm_idx_max']).T 222 | for num_cluster in num_cluster_list} 223 | 224 | regression_result = {"gamma_list_cluster": gamma_cluster, 225 | "delta_log_cluster": delta_log_cluster, 226 | "beta_cluster": beta_cluster, 227 | "result_cluster": result_cluster, } 228 | return regression_result 229 | 230 | 231 | def run_regression(adata: ad.AnnData, size_factor_key: Union[str, None], covariates=None, 232 | num_cluster_list=[1, 2], learning_rate=0.1, 233 | EM_ITER_MAX=20, M_ITER_MAX=10000, LL_diff_tolerance=1e-4, Q_diff_tolerance=1e-4, 234 | device: str='cuda:0', num_threads=1, verbose=False)->dict: 235 | """ 236 | Run regression. 237 | 238 | Args: 239 | adata: anndata.AnnData containing scRNA-seq data. `.X` should be a matrix containing raw count data of shape (# cells, # genes). 240 | size_factor_key: key of the size factor stored in `adata.obs`. If not set, you can calculate size factor using `scanpy.pp.normalize_total` as follows. `norm_factor = sc.pp.normalize_total(adata, exclude_highly_expressed=True, max_fraction= 0.2, inplace=False)["norm_factor"]; adata.obs["size_factor"] = norm_factor/norm_factor.mean()` If None, no size factor is used. 241 | covariates: a covariate matrix of shape (# cells, # covariates). Default: None. 242 | num_cluster_list: a list of numbers of clusters to test. Default: [1, 2]. 243 | learning_rate: learning rate of the Adamax optimizer. Default: 0.1. 244 | EM_ITER_MAX: maximum number of iterations of E-step of the EM trick algorithm. Default: 20. 245 | M_ITER_MAX: maximum number of iterations of M-step of the EM trick algorithm. Default: 10000. 246 | LL_diff_tolerance: tolerance of the difference of log-likelihood between two iterations of the EM trick algorithm. Default: 1e-4. 247 | Q_diff_tolerance: tolerance of the difference of Q between two iterations of the EM trick algorithm. Default: 1e-4. 248 | device: device to use. If you want to use GPU set to 'cuda:0'. If you want to use CPU set to 'cpu' Default: 'cuda:0'. 249 | verbose: if True, print the progress of the EM trick algorithm. Default: False. 250 | num_threads: number of threads to use. For each gene, MarcoPolo fits Poisson model to a matrix of (1, # cells). As the matrix is too small for us to fully utilize the power of GPU, it is good to use multiple threads at once. The best option depends on the number of cells and the GPU memory size. For 500 cells and 11GB, using 4 threads worked well. Default: 1. 251 | 252 | Returns: 253 | regression_result: a dictionary containing the regression results with the following fields: `gamma_list_cluster`, `delta_log_cluster`, `beta_cluster`, `result_cluster`. 254 | 255 | """ 256 | if num_threads > 1 and device.startswith('cuda'): 257 | print( 258 | f" Currently, you are using {num_threads} threads for regression. 
If you encounter any memory issues, try to set `num_threads` to 1.") 259 | 260 | expression_matrix = adata.X # (cell, gene) 261 | num_cells = expression_matrix.shape[0] 262 | num_genes = expression_matrix.shape[1] 263 | 264 | if not type(expression_matrix) == np.ndarray: 265 | expression_matrix = expression_matrix.toarray().astype(float) 266 | else: 267 | expression_matrix = expression_matrix.astype(float) 268 | 269 | if size_factor_key is None: 270 | cell_size_factor = np.ones(expression_matrix.shape[0]).astype(float) 271 | else: 272 | cell_size_factor = adata.obs[size_factor_key].values.astype(float) 273 | 274 | if covariates is None: 275 | covariate_matrix = np.ones((expression_matrix.shape[0], 1)).astype(float) 276 | else: 277 | covariate_matrix = covariates.astype(float) 278 | 279 | fit_one_gene_parameters = {"EM_ITER_MAX": EM_ITER_MAX, "M_ITER_MAX": M_ITER_MAX, 280 | "LL_diff_tolerance": LL_diff_tolerance, "Q_diff_tolerance": Q_diff_tolerance} 281 | 282 | 283 | print(f'The numbers of clusters to test: {num_cluster_list}') 284 | print(f'Y: {expression_matrix.shape} X: {covariate_matrix.shape} s: {cell_size_factor.shape}') 285 | 286 | if num_threads != 1: 287 | multiprocessing.set_start_method('spawn', force=True) 288 | pool = multiprocessing.Pool(processes=num_threads) 289 | 290 | gene_per_thread = expression_matrix.shape[1] // num_threads 291 | gene_thread_split = [(gene_per_thread * i, gene_per_thread * (i + 1)) for i in range(num_threads - 1)] + [ 292 | (gene_per_thread * (num_threads - 1), expression_matrix.shape[1])] 293 | #gene_thread_split=gene_thread_split[::-1] 294 | 295 | #multiprocessing.freeze_support() 296 | 297 | fit_result_thread = pool.starmap(fit_multiple_genes, [(expression_matrix[:, start_gene_idx: end_gene_idx], 298 | covariate_matrix[:], 299 | cell_size_factor[:], 300 | num_cluster_list, 301 | learning_rate, 302 | fit_one_gene_parameters, 303 | device, 304 | None, 305 | None, 306 | verbose) for start_gene_idx, end_gene_idx in 307 | gene_thread_split]) 308 | 309 | pool.close() 310 | 311 | regression_result = {} 312 | 313 | for fit_result_thread in fit_result_thread[:]: 314 | for category, value_cluster in fit_result_thread.items(): 315 | for num_cluster, value in value_cluster.items(): 316 | if isinstance(value, list): 317 | regression_result[category][num_cluster] = regression_result.setdefault(category, {}).get( 318 | num_cluster, []) + value 319 | elif isinstance(value, pd.DataFrame): 320 | regression_result[category][num_cluster] = regression_result.setdefault(category, {}).get( 321 | num_cluster, []) + [value.reset_index()] 322 | else: 323 | raise ValueError("Unknown type of value: {}".format(type(value))) 324 | 325 | for category in regression_result.keys(): 326 | for num_cluster in regression_result[category].keys(): 327 | if isinstance(regression_result[category][num_cluster][0], pd.DataFrame): 328 | regression_result[category][num_cluster] = pd.concat( 329 | regression_result[category][num_cluster]).reset_index() 330 | assert len(regression_result[category][num_cluster])==num_genes, RuntimeError("Length of result is not equal to number of genes.") 331 | 332 | else: 333 | regression_result = fit_multiple_genes(Y=expression_matrix[:, :], 334 | X=covariate_matrix[:], 335 | s=cell_size_factor[:], 336 | num_cluster_list=num_cluster_list, 337 | learning_rate=learning_rate, 338 | fit_one_gene_parameters=fit_one_gene_parameters, 339 | device=device, 340 | verbose=verbose) 341 | 342 | return regression_result 343 | 344 | 345 | if __name__ == '__main__': 346 | 
data_path = "/homes/gws/chanwkim/MarcoPolo/notebooks/example/hESC.h5ad" 347 | adata = ad.read(data_path) 348 | run_regression(adata=adata[:, :10], size_factor_key="size_factor", num_threads=3, device="cuda:2") 349 | 350 | # from scipy.io import mmread 351 | # import numpy as np 352 | # import pandas as pd 353 | # Y_=mmread('../datasets/koh_extract/koh.data.counts.mm').toarray().astype(float).transpose() 354 | # s_=pd.read_csv('../datasets/analysis/koh.size_factor_cluster.tsv',sep='\t',header=None)[0].values.astype(float)#.reshape(-1,1) 355 | # X_=np.array([np.ones(Y_.shape[0])]).transpose() 356 | 357 | device = torch.device('cuda:2') 358 | 359 | Y_ = np.ones((446, 4898)) 360 | X_ = np.ones((446, 1)) 361 | s_ = np.ones((446)) 362 | 363 | fit_multiple_genes(Y_select=Y_, X=X_, s=s_, num_cluster_list=[1, 2, 3], LR=0.1, EM_ITER_MAX=20, M_ITER_MAX=10000, 364 | LL_diff_tolerance=1e-4, Q_diff_tolerance=1e-4, device=device, verbose=True) 365 | -------------------------------------------------------------------------------- /MarcoPolo/summarization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chanwkimlab/MarcoPolo/3c0f3300a2e1c264a1697e7e8325f29db597b516/MarcoPolo/summarization/__init__.py -------------------------------------------------------------------------------- /MarcoPolo/summarization/summarizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import anndata as ad 4 | 5 | import warnings 6 | 7 | from sklearn import preprocessing 8 | from sklearn.decomposition import PCA 9 | 10 | import MarcoPolo.utils 11 | 12 | pd.options.mode.chained_assignment = None 13 | 14 | def find_markers(adata: ad.AnnData, regression_result: dict, mode: float=2, voting_thres: float=0.7, PCA_norm_thres:float=10, num_PC:int=2, log_fold_change_thres:float=0.6, 15 | oncell_size_min_count:float=10, oncell_size_max_proportion:float=70)->pd.DataFrame: 16 | """ 17 | find markers from the regression result 18 | Args: 19 | adata: anndata.AnnData containing scRNA-seq data. `.X` should be a matrix containing raw count data of shape (# cells, # genes). 20 | regression_result: dict containing regression results. Return value of `run_regression` function. 21 | mode: the number of groups to be used for marker selection. Default: 2. 22 | voting_thres: the threshold for voting. should be between 0 and 1. Default: 0.7. 23 | PCA_norm_thres: the threshold for PCA normalization. Default: 10. 24 | num_PC: the number of PCs to be used for marker selection. should be between 1 and 50 Default: 2. 25 | log_fold_change_thres: the threshold for log fold change. Default: 0.6. 26 | oncell_size_min_count: the minimum number of cells in on-cell group. Default: 10. 27 | oncell_size_max_proportion: the maximum proportion of cells in on-cell group. Default: 70. 28 | 29 | Returns: 30 | gene_scores: a pandas.DataFrame containing the following columns: 'MarcoPolo_rank', 'bimodality_score', 'voting_score', 'proximity_score', etc. 
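    Example:
        A minimal sketch of the intended call sequence, assuming `adata` holds raw counts and
        has a precomputed `"size_factor"` column in `.obs`, and that `regression_result` comes
        from `run_regression` with the default `num_cluster_list=[1, 2]` (the default `mode=2`
        below refers to that two-group fit):

            import MarcoPolo
            regression_result = MarcoPolo.run_regression(adata=adata, size_factor_key="size_factor")
            gene_scores = MarcoPolo.find_markers(adata=adata, regression_result=regression_result)
            top_genes = gene_scores.sort_values("MarcoPolo_rank").head(10)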
31 | 32 | """ 33 | expression_matrix = adata.X.copy() 34 | num_cells=expression_matrix.shape[0] 35 | num_genes=expression_matrix.shape[1] 36 | 37 | ######################## 38 | # Assign cells to on-cells and off-cells 39 | ######################## 40 | print("Assign cells to on-cells and off-cells...") 41 | gamma_list = regression_result["gamma_list_cluster"][mode] 42 | gamma_argmax_list = MarcoPolo.utils.gamma_list_expression_matrix_to_gamma_argmax_list(gamma_list, expression_matrix) 43 | 44 | ######################## 45 | # Calculate log fold change 46 | ######################## 47 | with warnings.catch_warnings(): 48 | warnings.simplefilter("ignore", category=RuntimeWarning) 49 | log_fold_change = np.log10(np.array([np.mean(expression_matrix[gamma_argmax_list[i] == 0, i]) for i in 50 | range(num_genes)]) / np.array( 51 | [np.mean(expression_matrix[gamma_argmax_list[i] != 0, i]) for i in range(num_genes)])) 52 | 53 | ######################## 54 | # Calculate voting score 55 | ######################## 56 | print("Calculating voting score...") 57 | oncell_size_list = np.sum(gamma_argmax_list == 0, axis=1) 58 | oncell_size_cliplist = MarcoPolo.utils.gamma_argmax_list_to_oncell_size_list_list(gamma_argmax_list) 59 | intersection_list = MarcoPolo.utils.gamma_argmax_list_to_intersection_list(gamma_argmax_list) 60 | #intersectioncount_prop=((intersection_list/oncell_size_cliplist)) 61 | #intersectioncount_prop_top10=[np.arange(0,len(i))[i>=sorted(i)[-10]][:10] for i in intersectioncount_prop] 62 | intersectioncount_threshold = ((intersection_list / oncell_size_cliplist) > voting_thres) 63 | voting_score = np.sum(intersectioncount_threshold, axis=1) 64 | 65 | ######################## 66 | # Calculate proximity score 67 | ######################## 68 | print("Calculating proximity score...") 69 | expression_matrix_norm = np.log1p(10000 * expression_matrix / expression_matrix.sum(axis=1, keepdims=True)) 70 | #expression_matrix_norm_scale = preprocessing.scale(expression_matrix_norm, axis=0, with_mean=True, with_std=True, copy=True) 71 | expression_matrix_norm_scale=(expression_matrix_norm-expression_matrix_norm.mean(axis=0, keepdims=True))/expression_matrix_norm.std(axis=0, keepdims=True) 72 | expression_matrix_norm_scale[expression_matrix_norm_scale > PCA_norm_thres] = PCA_norm_thres 73 | 74 | pca = PCA(n_components=50) 75 | pca.fit(expression_matrix_norm_scale) 76 | expression_matrix_norm_scale_pc = pca.transform(expression_matrix_norm_scale) 77 | 78 | proximity_score = np.array( 79 | [expression_matrix_norm_scale_pc[gamma_argmax_list[i] == 0, :num_PC].std(axis=0).mean() for i in 80 | range(num_genes)]) 81 | 82 | ######################## 83 | # Calculate bimodality score 84 | ######################## 85 | print("Calculating bimodality score...") 86 | QQratio = regression_result["result_cluster"][1]['Q'] / regression_result["result_cluster"][mode]['Q'] 87 | mean_all = np.array([np.mean(expression_matrix[:, i]) for i in range(num_genes)]) 88 | 89 | with warnings.catch_warnings(): 90 | warnings.simplefilter("ignore", category=RuntimeWarning) 91 | mean_on = np.array( 92 | [np.mean(expression_matrix[gamma_argmax_list[i] == 0, i]) for i in range(num_genes)]) 93 | MS = mean_on - mean_all 94 | 95 | ######################## 96 | # Final step of obtaining MarcoPolo score 97 | ######################## 98 | print("Calculating MarcoPolo score...") 99 | gene_scores = pd.DataFrame([QQratio.values, 100 | voting_score, 101 | proximity_score, 102 | log_fold_change, 103 | MS, 104 | oncell_size_list,], 105 | 
index=['QQratio', 106 | 'voting_score', 107 | 'proximity_score', 108 | 'log_fold_change', 109 | 'MS', 110 | 'oncell_size']).T 111 | 112 | gene_scores['QQratio_rank'] = \ 113 | pd.Series(np.arange(num_genes), index=gene_scores['QQratio'].sort_values(ascending=False).index).loc[ 114 | gene_scores.index] 115 | 116 | gene_scores['voting_score_rank'] = \ 117 | pd.Series(np.arange(num_genes), 118 | index=gene_scores['voting_score'].sort_values(ascending=False).index).loc[ 119 | gene_scores.index] 120 | gene_scores['voting_score_rank'][gene_scores['voting_score'] == 0] = 499999 121 | gene_scores['voting_score_rank'][gene_scores['voting_score'] == 1] = 999999 122 | 123 | gene_scores['proximity_score_rank'] = \ 124 | pd.Series(np.arange(num_genes), index=gene_scores['proximity_score'].sort_values(ascending=True).index).loc[ 125 | gene_scores.index] 126 | 127 | gene_scores['log_fold_change_rank'] = \ 128 | pd.Series(np.arange(num_genes), index=gene_scores['log_fold_change'].sort_values(ascending=False).index).loc[ 129 | gene_scores.index] 130 | 131 | gene_scores['MS_rank'] = \ 132 | pd.Series(np.arange(num_genes), index=gene_scores['MS'].sort_values(ascending=False).index).loc[ 133 | gene_scores.index] 134 | 135 | gene_scores['oncell_size_rank'] = \ 136 | pd.Series(np.arange(num_genes), index=gene_scores['oncell_size'].sort_values(ascending=False).index).loc[ 137 | gene_scores.index] 138 | 139 | # Exclude outliers genes from ranking. 140 | gene_scores['voting_score_rank'][~( 141 | (gene_scores['log_fold_change'] > log_fold_change_thres) & 142 | (gene_scores['oncell_size'] > int(oncell_size_min_count)) & 143 | (gene_scores['oncell_size'] < int(oncell_size_max_proportion / 100 * num_cells)) 144 | )] = len(gene_scores) 145 | 146 | gene_scores['bimodality_score_rank'] = gene_scores[['QQratio_rank', 'MS_rank']].min(axis=1).astype(int) 147 | gene_scores['bimodality_score_rank'][~( 148 | (gene_scores['log_fold_change'] > log_fold_change_thres) & 149 | (gene_scores['oncell_size'] > int(oncell_size_min_count)) & 150 | (gene_scores['oncell_size'] < int(oncell_size_max_proportion / 100 * num_cells)) 151 | )] = len(gene_scores) 152 | 153 | gene_scores['proximity_score_rank'] = gene_scores['proximity_score_rank'].copy().astype(int) 154 | gene_scores['proximity_score_rank'][~( 155 | (gene_scores['log_fold_change'] > log_fold_change_thres) & 156 | (gene_scores['oncell_size'] > int(oncell_size_min_count)) & 157 | (gene_scores['oncell_size'] < int(oncell_size_max_proportion / 100 * num_cells)) 158 | )] = len(gene_scores) 159 | 160 | MarcoPolo_score = gene_scores[['voting_score_rank', 'proximity_score_rank', 'bimodality_score_rank']].min(axis=1) 161 | 162 | gene_scores['MarcoPolo'] = MarcoPolo_score 163 | gene_scores['MarcoPolo_rank'] = pd.Series(np.arange(gene_scores.shape[0]), 164 | index=gene_scores.sort_values(['MarcoPolo', 'log_fold_change'], 165 | ascending=[True, False]).index).loc[ 166 | gene_scores.index] 167 | 168 | gene_scores = gene_scores.reindex(sorted(gene_scores.columns), axis=1) 169 | 170 | return gene_scores 171 | 172 | 173 | -------------------------------------------------------------------------------- /MarcoPolo/utils.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import warnings 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | def gamma_argmax_list_to_oncell_size_list_list(gamma_argmax_list: np.ndarray)->np.ndarray: 9 | """ 10 | 11 | Args: 12 | gamma_argmax_list: List of gamma_argmax. 
13 | 14 | Returns: 15 | np.ndarray: List of oncell size. 16 | 17 | """ 18 | oncellsize_list=np.sum(gamma_argmax_list==0,axis=1) 19 | # print(oncellsize_list) 20 | oncellsize_list_list=[np.clip(oncellsize_list, a_min=0, a_max=oncellsize) for oncellsize in oncellsize_list] 21 | 22 | return np.array(oncellsize_list_list) 23 | 24 | 25 | def gamma_argmax_list_to_intersection(gamma_argmax_list, idx): 26 | """ 27 | 28 | Args: 29 | gamma_argmax_list: List of gamma_argmax. 30 | idx: Index of gamma_argmax_list. 31 | 32 | Returns: 33 | np.ndarray: Intersection. 34 | 35 | """ 36 | intersection = np.sum((gamma_argmax_list[idx] == gamma_argmax_list) & (gamma_argmax_list[idx] == 0), axis=1) 37 | return intersection 38 | 39 | def gamma_argmax_list_to_intersection_list(gamma_argmax_list: np.ndarray)->np.ndarray: 40 | """ 41 | 42 | Args: 43 | gamma_argmax_list: List of gamma_argmax. 44 | 45 | Returns: 46 | np.ndarray: List of intersection. 47 | 48 | """ 49 | 50 | pool=multiprocessing.Pool(processes=16) 51 | 52 | intersection_list=pool.starmap(gamma_argmax_list_to_intersection,[(gamma_argmax_list,i) for i in np.arange(gamma_argmax_list.shape[0])]) 53 | 54 | pool.close() 55 | pool.join() 56 | 57 | return np.array(intersection_list) 58 | 59 | def gamma_expression_to_gamma_argmax(gamma: np.ndarray, expression: np.ndarray = None) -> np.ndarray: 60 | """ 61 | 62 | Args: 63 | gamma: A gamma matrix. 64 | expression: If expression is not None, it is used to calculate the which group has higher expression mean. 65 | 66 | Returns: 67 | np.ndarray: gamma_argmax. 68 | 69 | """ 70 | gamma_argmax = np.argmax(gamma, axis=1) 71 | gamma_argmax_counts = list(np.unique(gamma_argmax, return_counts=True)) 72 | if expression is None: 73 | key_newkey = pd.DataFrame(gamma_argmax_counts, index=['idx', 'counts']).T.set_index('idx').sort_values( 74 | by='counts', ascending=True).index.tolist() 75 | else: 76 | with warnings.catch_warnings(): 77 | warnings.simplefilter("ignore", category=RuntimeWarning) 78 | gamma_argmax_counts_lfc = gamma_argmax_counts + [(list( 79 | map(lambda x: np.mean(expression[gamma_argmax == x], axis=0) - np.mean( 80 | expression[gamma_argmax != x], axis=0), gamma_argmax_counts[0])))] 81 | key_newkey = pd.DataFrame(gamma_argmax_counts_lfc, index=['idx', 'counts', 'lfc']).T.astype( 82 | {'idx': int, 'counts': int}).set_index('idx').sort_values(by='lfc', ascending=False).index.tolist() 83 | gamma_argmax = [key_newkey.index(argmax) for argmax in gamma_argmax] 84 | 85 | return gamma_argmax 86 | 87 | def gamma_list_expression_matrix_to_gamma_argmax_list(gamma_list: np.ndarray, expression_matrix: np.ndarray=None)->np.ndarray: 88 | """ 89 | 90 | Args: 91 | gamma_list: List of gamma matrices. 92 | expression_matrix: If expression_matrix is not None, it is used to calculate the which group has higher expression mean. 93 | 94 | Returns: 95 | List of gamma_argmax. 
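    Example:
        A toy sketch with one gene and three cells (values invented for illustration). Labels are
        re-indexed so that label 0 is the group with the higher mean expression, i.e. the "on"
        cells that the summarizer later selects with `gamma_argmax_list[i] == 0`:

            gamma_list = np.array([[[0.9, 0.1], [0.2, 0.8], [0.1, 0.9]]])  # (genes, cells, groups)
            expression_matrix = np.array([[5.0], [0.0], [1.0]])            # (cells, genes)
            gamma_list_expression_matrix_to_gamma_argmax_list(gamma_list, expression_matrix)
            # -> array([[0, 1, 1]])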
96 | 97 | """ 98 | pool=multiprocessing.Pool(processes=16) 99 | 100 | if expression_matrix is None: 101 | gamma_argmax_list=pool.map(gamma_expression_to_gamma_argmax, gamma_list) 102 | else: 103 | gamma_argmax_list=pool.starmap(gamma_expression_to_gamma_argmax, [(gamma_list[i], expression_matrix[:, i]) for i in range(len(gamma_list))]) 104 | 105 | pool.close() 106 | pool.join() 107 | 108 | return np.array(gamma_argmax_list) -------------------------------------------------------------------------------- /MarcoPolo/visualization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chanwkimlab/MarcoPolo/3c0f3300a2e1c264a1697e7e8325f29db597b516/MarcoPolo/visualization/__init__.py -------------------------------------------------------------------------------- /MarcoPolo/visualization/reporter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import anndata as ad 7 | import matplotlib.pyplot as plt 8 | import seaborn as sns 9 | from jinja2 import Template 10 | from tqdm import tqdm 11 | from typing import Union, List, Tuple 12 | from pathlib import Path 13 | import MarcoPolo.utils 14 | 15 | def get_discrete_palette(n, palette=None): 16 | """ 17 | Porting of https://github.com/satijalab/seurat/blob/b51801bc4b1a66aed5456473c9fe0be884994c93/R/visualization.R#L2686 18 | Generate a list of colors that are well separated with one another. 19 | 20 | :param int n: number of colors 21 | 22 | :return numpy.array: list of colors 23 | """ 24 | palettes={ 25 | 'alphabet':[ 26 | "#F0A0FF", "#0075DC", "#993F00", "#4C005C", "#191919", "#005C31", 27 | "#2BCE48", "#FFCC99", "#808080", "#94FFB5", "#8F7C00", "#9DCC00", 28 | "#C20088", "#003380", "#FFA405", "#FFA8BB", "#426600", "#FF0010", 29 | "#5EF1F2", "#00998F", "#E0FF66", "#740AFF", "#990000", "#FFFF80", 30 | "#FFE100", "#FF5005" 31 | ], 32 | 'alphabet2':[ 33 | "#AA0DFE", "#3283FE", "#85660D", "#782AB6", "#565656", "#1C8356", 34 | "#16FF32", "#F7E1A0", "#E2E2E2", "#1CBE4F", "#C4451C", "#DEA0FD", 35 | "#FE00FA", "#325A9B", "#FEAF16", "#F8A19F", "#90AD1C", "#F6222E", 36 | "#1CFFCE", "#2ED9FF", "#B10DA1", "#C075A6", "#FC1CBF", "#B00068", 37 | "#FBE426", "#FA0087" 38 | ], 39 | 'glasbey':[ 40 | "#0000FF", "#FF0000", "#00FF00", "#000033", "#FF00B6", "#005300", 41 | "#FFD300", "#009FFF", "#9A4D42", "#00FFBE", "#783FC1", "#1F9698", 42 | "#FFACFD", "#B1CC71", "#F1085C", "#FE8F42", "#DD00FF", "#201A01", 43 | "#720055", "#766C95", "#02AD24", "#C8FF00", "#886C00", "#FFB79F", 44 | "#858567", "#A10300", "#14F9FF", "#00479E", "#DC5E93", "#93D4FF", 45 | "#004CFF", "#F2F318" 46 | ], 47 | 'polychrome':[ 48 | "#5A5156", "#E4E1E3", "#F6222E", "#FE00FA", "#16FF32", "#3283FE", 49 | "#FEAF16", "#B00068", "#1CFFCE", "#90AD1C", "#2ED9FF", "#DEA0FD", 50 | "#AA0DFE", "#F8A19F", "#325A9B", "#C4451C", "#1C8356", "#85660D", 51 | "#B10DA1", "#FBE426", "#1CBE4F", "#FA0087", "#FC1CBF", "#F7E1A0", 52 | "#C075A6", "#782AB6", "#AAF400", "#BDCDFF", "#822E1C", "#B5EFB5", 53 | "#7ED7D1", "#1C7F93", "#D85FF7", "#683B79", "#66B0FF", "#3B00FB" 54 | ], 55 | 'stepped':[ 56 | "#990F26", "#B33E52", "#CC7A88", "#E6B8BF", "#99600F", "#B3823E", 57 | "#CCAA7A", "#E6D2B8", "#54990F", "#78B33E", "#A3CC7A", "#CFE6B8", 58 | "#0F8299", "#3E9FB3", "#7ABECC", "#B8DEE6", "#3D0F99", "#653EB3", 59 | "#967ACC", "#C7B8E6", "#333333", "#666666", "#999999", "#CCCCCC" 60 | ] 61 | } 62 | if palette is None: 63 | if n<=26: 64 | 
palette="alphabet" 65 | elif n<=32: 66 | palette="glasbey" 67 | else: 68 | palette="polychrome" 69 | 70 | palette_array= palettes[palette] 71 | assert n<=len(palette_array), "Not enough colours in specified palette" 72 | 73 | return np.array(palette_array)[np.arange(n)] 74 | 75 | 76 | def annotate_gene_info(gene_scores, gene_query_list, gene_info, by): 77 | """ 78 | Annotate gene_scores matrix with gene info. 79 | """ 80 | 81 | gene_scores = gene_scores.copy() 82 | gene_info_select_list = [] 83 | 84 | column_list = ['Symbol', 'description', 'Other_designations', 'type_of_gene', 'dbXrefs'] 85 | 86 | not_matched_list = [] 87 | pbar=tqdm(gene_query_list) 88 | for idx, query in enumerate(pbar): 89 | if by == 'ID': 90 | gene_info_select = gene_info[gene_info['dbXrefs'].str.contains(query, regex=False)] 91 | else: 92 | gene_info_select = gene_info[gene_info['Symbol'].str.lower() == query.lower()] 93 | if len(gene_info_select) == 0: 94 | gene_info_select = gene_info[gene_info['Synonyms'].str.lower().str.contains(query.lower(), regex=False)] 95 | 96 | if len(gene_info_select) >= 1: 97 | gene_info_select_list.append(gene_info_select[column_list].iloc[0]) 98 | else: 99 | gene_info_select_list.append(pd.Series(index=column_list, dtype=float)) 100 | not_matched_list.append(query) 101 | #pbar.set_description(f"Number of genes unmatched: {len(not_matched_list)}/ {len(gene_query_list)}") 102 | pbar.set_postfix({'Num. of unmatched genes': len(not_matched_list)}) 103 | print(f"{len(not_matched_list)} not matched genes: {', '.join(not_matched_list[:20])+ ', ...' if len(not_matched_list) > 20 else ', '.join(not_matched_list)}") 104 | gene_info_extract = pd.DataFrame(gene_info_select_list, index=np.arange(len(gene_query_list))) 105 | 106 | 107 | assert len(gene_info_extract) == len(gene_scores), "gene_info_extract and gene_scores have different length" 108 | gene_scores = gene_scores.merge(right=gene_info_extract, left_index=True, right_index=True) 109 | 110 | return gene_scores 111 | 112 | 113 | def generate_html_file(output_dir, gene_scores, num_genes, num_cells, top_num_html=1000): 114 | os.makedirs('{}'.format(output_dir), exist_ok=True) 115 | os.makedirs('{}/plot_image'.format(output_dir), exist_ok=True) 116 | os.makedirs('{}/assets'.format(output_dir), exist_ok=True) 117 | 118 | shutil.copy(os.path.join(os.path.dirname(__file__), 'template/assets/scripts.js'), 119 | '{}/assets/scripts.js'.format(output_dir)) 120 | shutil.copy(os.path.join(os.path.dirname(__file__), 'template/assets/styles.css'), 121 | '{}/assets/styles.css'.format(output_dir)) 122 | shutil.copy(os.path.join(os.path.dirname(__file__), 'template/assets/details_open.png'), 123 | '{}/assets/details_open.png'.format(output_dir)) 124 | shutil.copy(os.path.join(os.path.dirname(__file__), 'template/assets/details_close.png'), 125 | '{}/assets/details_close.png'.format(output_dir)) 126 | shutil.copy(os.path.join(os.path.dirname(__file__), 'template/assets/mp.png'), 127 | '{}/assets/mp.png'.format(output_dir)) 128 | shutil.copy(os.path.join(os.path.dirname(__file__), 'template/assets/mp_white.png'), 129 | '{}/assets/mp_white.png'.format(output_dir)) 130 | shutil.copy(os.path.join(os.path.dirname(__file__), 'template/assets/mp_white_large_font.png'), 131 | '{}/assets/mp_white_large_font.png'.format(output_dir)) 132 | 133 | with open(os.path.join(os.path.dirname(__file__), 'template/index.html'), 'r') as f: 134 | template_read = f.read() 135 | template = Template(source=template_read) 136 | 137 | MarcoPolo_table = 
gene_scores.sort_values("MarcoPolo_rank", ascending=True).set_index('MarcoPolo_rank').iloc[ 138 | :top_num_html] 139 | MarcoPolo_table.index += 1 140 | MarcoPolo_table = MarcoPolo_table.to_html(classes="table table-bordered", table_id='dataTable') 141 | 142 | MarcoPolo_table = MarcoPolo_table.replace('=sorted(i)[-10]][:10] for i in intersectioncount_prop] 330 | 331 | gene_scores_munge['Voting_genes_top10']=[gene_scores_munge['Gene ID'][i].values for i in intersectioncount_prop_top10] 332 | gene_scores_munge_voting=gene_scores_munge.copy() 333 | 334 | ######################## 335 | # Annotate gene_scores with gene info 336 | ######################## 337 | if gene_info_path is not None: 338 | print(f"Annotating genes with the gene info: {gene_info_path}") 339 | 340 | gene_info=pd.read_csv(gene_info_path,sep='\t') 341 | #import ipdb 342 | #ipdb.set_trace() 343 | by='ID' if 'ENS' in adata.var.index.values.tolist()[0] else 'name' 344 | gene_scores_munge=annotate_gene_info(gene_scores=gene_scores_munge, gene_query_list=adata.var.index.values.tolist(), gene_info=gene_info, by=by) 345 | 346 | gene_scores_munge['Log2FC']=(gene_scores_munge['log_fold_change']/np.log10(2)).round(2) 347 | 348 | gene_scores_munge=gene_scores_munge[[ 349 | 'MarcoPolo_rank', 350 | 'Gene ID', 351 | 'Symbol', 'description', 'Other_designations', 'type_of_gene', 352 | 'Log2FC', 353 | 'MarcoPolo', 354 | 'bimodality_score_rank', 355 | 'voting_score_rank', 356 | 'proximity_score_rank', 357 | 'oncell_size','oncell_size_rank', 358 | 'dbXrefs' 359 | ]] 360 | gene_scores_munge['img'] = gene_scores_munge.apply(lambda x: '{idx}'.format(idx=x.name), axis=1) 361 | 362 | 363 | else: 364 | gene_scores_munge['Log2FC'] = (gene_scores_munge['log_fold_change'] / np.log10(2)).round(2) 365 | 366 | gene_scores_munge=gene_scores_munge[[ 367 | 'MarcoPolo_rank', 368 | 'Gene ID', 369 | 'Log2FC', 370 | 'MarcoPolo', 371 | 'bimodality_score_rank', 372 | 'voting_score_rank', 373 | 'proximity_score_rank', 374 | 'oncell_size','oncell_size_rank', 375 | ]] 376 | gene_scores_munge['img']=gene_scores_munge.apply(lambda x: '{idx}'.format(idx=x.name),axis=1) 377 | 378 | # import ipdb; ipdb.set_trace() 379 | ######################## 380 | # Generate table files 381 | ######################## 382 | print(f"Generating table files...") 383 | generate_html_file(output_dir=output_dir, gene_scores=gene_scores_munge, num_genes=num_genes, num_cells=num_cells, top_num_html=top_num_html) 384 | gene_scores_munge_voting[['Gene ID','Voting_genes_top10']].to_html('{}/voting_result.html'.format(output_dir)) 385 | gene_scores_munge.to_csv('{}.table.tsv'.format(output_dir), sep='\t') 386 | 387 | ######################## 388 | # Generate image files 389 | ######################## 390 | print(f"Generating image files...") 391 | generate_image_files(adata=adata, size_factor_key=size_factor_key, gamma_argmax_list=gamma_argmax_list, gene_scores=gene_scores, 392 | low_dim_key=low_dim_key, output_dir=output_dir, top_num_image=top_num_image, 393 | cell_color_key=cell_color_key, 394 | **plot_parameters) 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | -------------------------------------------------------------------------------- /MarcoPolo/visualization/template/assets/details_close.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chanwkimlab/MarcoPolo/3c0f3300a2e1c264a1697e7e8325f29db597b516/MarcoPolo/visualization/template/assets/details_close.png 
-------------------------------------------------------------------------------- /MarcoPolo/visualization/template/assets/details_open.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chanwkimlab/MarcoPolo/3c0f3300a2e1c264a1697e7e8325f29db597b516/MarcoPolo/visualization/template/assets/details_open.png -------------------------------------------------------------------------------- /MarcoPolo/visualization/template/assets/mp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chanwkimlab/MarcoPolo/3c0f3300a2e1c264a1697e7e8325f29db597b516/MarcoPolo/visualization/template/assets/mp.png -------------------------------------------------------------------------------- /MarcoPolo/visualization/template/assets/mp_white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chanwkimlab/MarcoPolo/3c0f3300a2e1c264a1697e7e8325f29db597b516/MarcoPolo/visualization/template/assets/mp_white.png -------------------------------------------------------------------------------- /MarcoPolo/visualization/template/assets/mp_white_large_font.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chanwkimlab/MarcoPolo/3c0f3300a2e1c264a1697e7e8325f29db597b516/MarcoPolo/visualization/template/assets/mp_white_large_font.png -------------------------------------------------------------------------------- /MarcoPolo/visualization/template/assets/scripts.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Start Bootstrap - SB Admin v6.0.1 (https://startbootstrap.com/templates/sb-admin) 3 | * Copyright 2013-2020 Start Bootstrap 4 | * Licensed under MIT (https://github.com/StartBootstrap/startbootstrap-sb-admin/blob/master/LICENSE) 5 | */ 6 | (function($) { 7 | "use strict"; 8 | 9 | // Add active state to sidbar nav links 10 | var path = window.location.href; // because the 'href' property of the DOM element is the absolute path 11 | $("#layoutSidenav_nav .sb-sidenav a.nav-link").each(function() { 12 | if (this.href === path) { 13 | $(this).addClass("active"); 14 | } 15 | }); 16 | 17 | // Toggle the side navigation 18 | $("#sidebarToggle").on("click", function(e) { 19 | e.preventDefault(); 20 | $("body").toggleClass("sb-sidenav-toggled"); 21 | }); 22 | })(jQuery); 23 | 24 | function htmlDecode(input){ 25 | var e = document.createElement('textarea'); 26 | e.innerHTML = input; 27 | // handle case of empty input 28 | return e.childNodes.length === 0 ? "" : e.childNodes[0].nodeValue; 29 | } 30 | 31 | function format ( d ) { 32 | // `d` is the original data object for the row 33 | return '
'+ 34 | ''+ 35 | ''+ 36 | ''+ 37 | '
'+htmlDecode(d[d.length-1])+'
'; 38 | } 39 | 40 | 41 | // Call the dataTables jQuery plugin 42 | $(document).ready(function() { 43 | var table =$('#dataTable').DataTable({ 44 | "pageLength": 50, 45 | "order": [[ 0, "asc" ]], 46 | "columnDefs": [{ 47 | "targets": [ -1 ], 48 | "visible": false, 49 | "searchable": false 50 | }] 51 | }); 52 | 53 | 54 | $('#dataTable tbody').on('click', 'tr', function () { 55 | var tr = $(this).closest('tr'); 56 | var row = table.row( tr ); 57 | 58 | if ( row.child.isShown() ) { 59 | // This row is already open - close it 60 | row.child.hide(); 61 | tr.removeClass('shown'); 62 | } 63 | else { 64 | // Open this row 65 | //row.child('dsdsd').show();//format(row.data()) 66 | row.child(format(row.data()) ).show();//format(r) 67 | tr.addClass('shown'); 68 | } 69 | } ); 70 | 71 | }); 72 | 73 | -------------------------------------------------------------------------------- /MarcoPolo/visualization/template/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 13 | 14 | 15 | 16 | 17 | 18 | MarcoPolo 19 | 20 | 21 | 22 | 23 | 30 | 31 | 32 | 66 |
[index.html template body: the HTML markup was stripped from this dump and cannot be reconstructed here. The rendered report page shows an "Analysis Report" header, the summary lines "Number of cells : {{num_cell}}" and "Number of genes : {{num_gene}}", and a "List of Genes" card that renders the {{MarcoPolo_table}} placeholder.]
738 | 739 | 740 | 741 | 742 | 743 | 744 | 745 | 746 | 747 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ![MarcoPolo](assets/mp.png) 4 | 5 | MarcoPolo is a method to discover differentially expressed genes in single-cell RNA-seq data without depending on prior clustering 6 | 7 | [![Stars](https://img.shields.io/github/stars/chanwkimlab/marcopolo?logo=GitHub&color=yellow)](https://github.com/chanwkim/marcopolo/stargazers) 8 | [![PyPI](https://img.shields.io/pypi/v/marcopolo-pytorch.svg)](https://pypi.org/project/marcopolo-pytorch) 9 | [![Downloads](https://pepy.tech/badge/marcopolo-pytorch)](https://pepy.tech/project/marcopolo-pytorch) 10 | 11 | ## Overview 12 | 13 | 14 | 15 | 16 | 17 | `MarcoPolo` is a novel clustering-independent approach to identifying DEGs in scRNA-seq data. MarcoPolo identifies informative DEGs without depending on prior clustering, and therefore is robust to uncertainties from clustering or cell type assignment. Since DEGs are identified independent of clustering, one can utilize them to detect subtypes of a cell population that are not detected by the standard clustering, or one can utilize them to augment HVG methods to improve clustering. An advantage of our method is that it automatically learns which cells are expressed and which are not by fitting the bimodal distribution. Additionally, our framework provides analysis results in the form of an HTML file so that researchers can conveniently visualize and interpret the results. 18 | 19 | 20 | |Datasets|URL| 21 | |:---|:---| 22 | |Human liver cells (MacParland et al.)|[https://chanwkimlab.github.io/MarcoPolo/HumanLiver/](https://chanwkimlab.github.io/MarcoPolo/HumanLiver/)| 23 | |Human embryonic stem cells (The Koh et al.)|[https://chanwkimlab.github.io/MarcoPolo/hESC/](https://chanwkimlab.github.io/MarcoPolo/hESC/)| 24 | |Peripheral blood mononuclear cells (Zheng et al.)|[https://chanwkimlab.github.io/MarcoPolo/Zhengmix8eq/](https://chanwkimlab.github.io/MarcoPolo/Zhengmix8eq/)| 25 | 26 | 27 | ## Preparing dataset 28 | MarcoPolo works jointly with [AnnData](https://anndata.readthedocs.io/), a flexible and efficient data format for scRNA-seq data widely used in python community. This enables MarcoPolo to seamlessly work with other popular single cell software packages such as [scanpy](https://scanpy.readthedocs.io/), or more broadly, other packages included in the [scverse](https://scverse.org/projects/) project, etc as they also work based on AnnData. 29 | 30 | You should prepare your scRNA-seq data in AnnData object before running MarcoPolo. 31 | Please refer to the [AnnData's Getting started page](https://anndata-tutorials.readthedocs.io/en/latest/getting-started.html) for more information about AnnData. 32 | If your data is in seurat object, you can very easily convert it to AnnData following the instructions [here](https://satijalab.org/seurat/articles/conversion_vignette.html). 33 | 34 | As MarcoPolo runs on raw count data, anndata should contain the raw count data in `.X`. The structure of Anndata is described [here](https://anndata.readthedocs.io/en/latest/generated/anndata.AnnData.html). 
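For example, starting from raw counts, the per-cell size factor mentioned in the `run_regression` docstring can be precomputed with `scanpy` before calling MarcoPolo. The snippet below is a minimal sketch, assuming an `.h5ad` file with raw counts in `.X`; the file name and the `"size_factor"` key are placeholders you can change.

```python
import anndata as ad
import scanpy as sc
import MarcoPolo

# Load an AnnData object whose .X holds raw counts (cells x genes).
adata = ad.read_h5ad("hESC.h5ad")

# Per-cell size factor, computed as suggested in the run_regression docstring.
norm_factor = sc.pp.normalize_total(
    adata, exclude_highly_expressed=True, max_fraction=0.2, inplace=False
)["norm_factor"]
adata.obs["size_factor"] = norm_factor / norm_factor.mean()

# Fit the per-gene Poisson mixture (1 and 2 groups) and rank genes by the MarcoPolo score.
regression_result = MarcoPolo.run_regression(
    adata=adata, size_factor_key="size_factor", num_cluster_list=[1, 2], device="cuda:0"
)
gene_scores = MarcoPolo.find_markers(adata=adata, regression_result=regression_result)
```

`MarcoPolo.generate_report` then turns these results into the HTML report shown above; see the notebook linked below for its exact arguments.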
35 | 36 | ## Running MarcoPolo with Google Colab 37 | You can easily try MarcoPolo with Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/chanwkimlab/MarcoPolo/blob/main/notebooks/MarcoPolo.ipynb) 38 | 39 | Google colab is a free cloud environment for running Python code. Colab allows you to execute MarcoPolo in your browser without any configurations and GPU resources. 40 | 41 | ## Running MarcoPolo with your local machine 42 | ### How to install MarcoPolo 43 | We recommend using the following pipeline to install MarcoPolo. 44 | 1. Anaconda 45 | 46 | Please refer to https://docs.anaconda.com/anaconda/install/linux/ to install Anaconda. 47 | Then, please make a new conda environment and activate it. 48 | ``` 49 | conda create -n MarcoPolo python=3.8 50 | conda activate MarcoPolo 51 | ``` 52 | 53 | 2. Pytorch 54 | 55 | Please install `PyTorch` from https://pytorch.org/ (If you want to install CUDA-supported PyTorch, please install CUDA in advance) 56 | 57 | 3. MarcoPolo 58 | 59 | You can simply install MarcoPolo by using the `pip` command: 60 | ```bash 61 | pip install marcopolo-pytorch 62 | ``` 63 | If MarcoPolo installed on your machine is outdated, you can get an updated version of MarcoPolo by using the `pip` command: 64 | ```bash 65 | pip install marcopolo-pytorch --upgrade 66 | ``` 67 | 68 | ### How to run MarcoPolo 69 | Please refer to this [notebook](https://github.com/chanwkimlab/MarcoPolo/blob/main/notebooks/MarcoPolo.ipynb) for the usage of MarcoPolo. 70 | 71 | ## Citation 72 | 73 | If you use any part of this code or our data, please cite our 74 | [paper](https://doi.org/10.1093/nar/gkac216). 75 | 76 | ``` 77 | @article{kim2022marcopolo, 78 | title={MarcoPolo: a method to discover differentially expressed genes in single-cell RNA-seq data without depending on prior clustering}, 79 | author={Kim, Chanwoo and Lee, Hanbin and Jeong, Juhee and Jung, Keehoon and Han, Buhm}, 80 | journal={Nucleic Acids Research}, 81 | year={2022} 82 | } 83 | ``` 84 | 85 | ## Contact 86 | If you have any inquiries, please feel free to contact 87 | - [Chanwoo Kim](https://chanwoo.kim) (Paul G. 
Allen School of Computer Science & Engineering @ the University of 88 | Washington) -------------------------------------------------------------------------------- /assets/mp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chanwkimlab/MarcoPolo/3c0f3300a2e1c264a1697e7e8325f29db597b516/assets/mp.png -------------------------------------------------------------------------------- /assets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chanwkimlab/MarcoPolo/3c0f3300a2e1c264a1697e7e8325f29db597b516/assets/overview.png -------------------------------------------------------------------------------- /notebooks/MarcoPolo.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","metadata":{"id":"vs9UoZ6URwgl"},"source":["[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/chanwkimlab/MarcoPolo/blob/main/notebooks/tutorial.ipynb)"]},{"cell_type":"markdown","metadata":{"pycharm":{"name":"#%% md\n"},"id":"gA_KZBsPRwgm"},"source":["# Setup"]},{"cell_type":"markdown","metadata":{"pycharm":{"name":"#%%\n"},"id":"bMTZ-QvYRwgn"},"source":["**Start the colab kernel with GPU**: Runtime -> Change runtime type -> GPU"]},{"cell_type":"markdown","metadata":{"pycharm":{"name":"#%%\n"},"id":"bL93Y8aYRwgn"},"source":["## Install dependencies"]},{"cell_type":"code","execution_count":null,"metadata":{"pycharm":{"name":"#%%\n"},"scrolled":false,"colab":{"base_uri":"https://localhost:8080/","height":1000},"id":"kFTpg30WRwgn","executionInfo":{"status":"ok","timestamp":1655446154392,"user_tz":420,"elapsed":27463,"user":{"displayName":"","userId":""}},"outputId":"4b517f3e-ed68-4b75-b8b2-633727692c38"},"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n","Collecting marcopolo-pytorch\n"," Downloading marcopolo_pytorch-1.0.6-py3-none-any.whl (614 kB)\n","\u001b[K |████████████████████████████████| 614 kB 5.1 MB/s \n","\u001b[?25hCollecting anndata>=0.7.4\n"," Downloading anndata-0.8.0-py3-none-any.whl (96 kB)\n","\u001b[K |████████████████████████████████| 96 kB 4.6 MB/s \n","\u001b[?25hRequirement already satisfied: pandas>=1.2.0 in /usr/local/lib/python3.7/dist-packages (from marcopolo-pytorch) (1.3.5)\n","Requirement already satisfied: rpy2>=3.4.2 in /usr/local/lib/python3.7/dist-packages (from marcopolo-pytorch) (3.4.5)\n","Requirement already satisfied: ipywidgets>=7.5.1 in /usr/local/lib/python3.7/dist-packages (from marcopolo-pytorch) (7.7.0)\n","Collecting einops>=0.3\n"," Downloading einops-0.4.1-py3-none-any.whl (28 kB)\n","Requirement already satisfied: Jinja2>=2.11.2 in /usr/local/lib/python3.7/dist-packages (from marcopolo-pytorch) (2.11.3)\n","Collecting scanpy>=1.9.0\n"," Downloading scanpy-1.9.1-py3-none-any.whl (2.0 MB)\n","\u001b[K |████████████████████████████████| 2.0 MB 76.1 MB/s \n","\u001b[?25hRequirement already satisfied: torch>=1.4.0 in /usr/local/lib/python3.7/dist-packages (from marcopolo-pytorch) (1.11.0+cu113)\n","Requirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.7/dist-packages (from marcopolo-pytorch) (1.0.2)\n","Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from marcopolo-pytorch) (4.64.0)\n","Collecting scipy>=1.6.1\n"," Downloading 
scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)\n","\u001b[K |████████████████████████████████| 38.1 MB 15.2 MB/s \n","\u001b[?25hCollecting matplotlib>=3.3.0\n"," Downloading matplotlib-3.5.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.2 MB)\n","\u001b[K |████████████████████████████████| 11.2 MB 70.9 MB/s \n","\u001b[?25hRequirement already satisfied: seaborn>=0.11.1 in /usr/local/lib/python3.7/dist-packages (from marcopolo-pytorch) (0.11.2)\n","Requirement already satisfied: numpy>=1.19.2 in /usr/local/lib/python3.7/dist-packages (from marcopolo-pytorch) (1.21.6)\n","Requirement already satisfied: h5py>=3 in /usr/local/lib/python3.7/dist-packages (from anndata>=0.7.4->marcopolo-pytorch) (3.1.0)\n","Requirement already satisfied: typing_extensions in /usr/local/lib/python3.7/dist-packages (from anndata>=0.7.4->marcopolo-pytorch) (4.2.0)\n","Requirement already satisfied: importlib_metadata>=0.7 in /usr/local/lib/python3.7/dist-packages (from anndata>=0.7.4->marcopolo-pytorch) (4.11.4)\n","Requirement already satisfied: packaging>=20 in /usr/local/lib/python3.7/dist-packages (from anndata>=0.7.4->marcopolo-pytorch) (21.3)\n","Requirement already satisfied: natsort in /usr/local/lib/python3.7/dist-packages (from anndata>=0.7.4->marcopolo-pytorch) (5.5.0)\n","Requirement already satisfied: cached-property in /usr/local/lib/python3.7/dist-packages (from h5py>=3->anndata>=0.7.4->marcopolo-pytorch) (1.5.2)\n","Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib_metadata>=0.7->anndata>=0.7.4->marcopolo-pytorch) (3.8.0)\n","Requirement already satisfied: jupyterlab-widgets>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from ipywidgets>=7.5.1->marcopolo-pytorch) (1.1.0)\n","Requirement already satisfied: widgetsnbextension~=3.6.0 in /usr/local/lib/python3.7/dist-packages (from ipywidgets>=7.5.1->marcopolo-pytorch) (3.6.0)\n","Requirement already satisfied: ipython-genutils~=0.2.0 in /usr/local/lib/python3.7/dist-packages (from ipywidgets>=7.5.1->marcopolo-pytorch) (0.2.0)\n","Requirement already satisfied: ipykernel>=4.5.1 in /usr/local/lib/python3.7/dist-packages (from ipywidgets>=7.5.1->marcopolo-pytorch) (4.10.1)\n","Requirement already satisfied: ipython>=4.0.0 in /usr/local/lib/python3.7/dist-packages (from ipywidgets>=7.5.1->marcopolo-pytorch) (5.5.0)\n","Requirement already satisfied: traitlets>=4.3.1 in /usr/local/lib/python3.7/dist-packages (from ipywidgets>=7.5.1->marcopolo-pytorch) (5.1.1)\n","Requirement already satisfied: nbformat>=4.2.0 in /usr/local/lib/python3.7/dist-packages (from ipywidgets>=7.5.1->marcopolo-pytorch) (5.4.0)\n","Requirement already satisfied: jupyter-client in /usr/local/lib/python3.7/dist-packages (from ipykernel>=4.5.1->ipywidgets>=7.5.1->marcopolo-pytorch) (5.3.5)\n","Requirement already satisfied: tornado>=4.0 in /usr/local/lib/python3.7/dist-packages (from ipykernel>=4.5.1->ipywidgets>=7.5.1->marcopolo-pytorch) (5.1.1)\n","Requirement already satisfied: simplegeneric>0.8 in /usr/local/lib/python3.7/dist-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->marcopolo-pytorch) (0.8.1)\n","Requirement already satisfied: prompt-toolkit<2.0.0,>=1.0.4 in /usr/local/lib/python3.7/dist-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->marcopolo-pytorch) (1.0.18)\n","Requirement already satisfied: pygments in /usr/local/lib/python3.7/dist-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->marcopolo-pytorch) (2.6.1)\n","Requirement already satisfied: setuptools>=18.5 in 
/usr/local/lib/python3.7/dist-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->marcopolo-pytorch) (57.4.0)\n","Requirement already satisfied: decorator in /usr/local/lib/python3.7/dist-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->marcopolo-pytorch) (4.4.2)\n","Requirement already satisfied: pickleshare in /usr/local/lib/python3.7/dist-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->marcopolo-pytorch) (0.7.5)\n","Requirement already satisfied: pexpect in /usr/local/lib/python3.7/dist-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->marcopolo-pytorch) (4.8.0)\n","Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2>=2.11.2->marcopolo-pytorch) (2.0.1)\n","Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.7/dist-packages (from matplotlib>=3.3.0->marcopolo-pytorch) (2.8.2)\n","Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib>=3.3.0->marcopolo-pytorch) (0.11.0)\n","Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib>=3.3.0->marcopolo-pytorch) (3.0.9)\n","Collecting fonttools>=4.22.0\n"," Downloading fonttools-4.33.3-py3-none-any.whl (930 kB)\n","\u001b[K |████████████████████████████████| 930 kB 59.1 MB/s \n","\u001b[?25hRequirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.7/dist-packages (from matplotlib>=3.3.0->marcopolo-pytorch) (7.1.2)\n","Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib>=3.3.0->marcopolo-pytorch) (1.4.2)\n","Requirement already satisfied: jupyter-core in /usr/local/lib/python3.7/dist-packages (from nbformat>=4.2.0->ipywidgets>=7.5.1->marcopolo-pytorch) (4.10.0)\n","Requirement already satisfied: jsonschema>=2.6 in /usr/local/lib/python3.7/dist-packages (from nbformat>=4.2.0->ipywidgets>=7.5.1->marcopolo-pytorch) (4.3.3)\n","Requirement already satisfied: fastjsonschema in /usr/local/lib/python3.7/dist-packages (from nbformat>=4.2.0->ipywidgets>=7.5.1->marcopolo-pytorch) (2.15.3)\n","Requirement already satisfied: importlib-resources>=1.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->ipywidgets>=7.5.1->marcopolo-pytorch) (5.7.1)\n","Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->ipywidgets>=7.5.1->marcopolo-pytorch) (21.4.0)\n","Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->ipywidgets>=7.5.1->marcopolo-pytorch) (0.18.1)\n","Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.2.0->marcopolo-pytorch) (2022.1)\n","Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.7/dist-packages (from prompt-toolkit<2.0.0,>=1.0.4->ipython>=4.0.0->ipywidgets>=7.5.1->marcopolo-pytorch) (1.15.0)\n","Requirement already satisfied: wcwidth in /usr/local/lib/python3.7/dist-packages (from prompt-toolkit<2.0.0,>=1.0.4->ipython>=4.0.0->ipywidgets>=7.5.1->marcopolo-pytorch) (0.2.5)\n","Requirement already satisfied: tzlocal in /usr/local/lib/python3.7/dist-packages (from rpy2>=3.4.2->marcopolo-pytorch) (1.5.1)\n","Requirement already satisfied: cffi>=1.10.0 in /usr/local/lib/python3.7/dist-packages (from rpy2>=3.4.2->marcopolo-pytorch) (1.15.0)\n","Requirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from 
cffi>=1.10.0->rpy2>=3.4.2->marcopolo-pytorch) (2.21)\n","Requirement already satisfied: statsmodels>=0.10.0rc2 in /usr/local/lib/python3.7/dist-packages (from scanpy>=1.9.0->marcopolo-pytorch) (0.10.2)\n","Requirement already satisfied: networkx>=2.3 in /usr/local/lib/python3.7/dist-packages (from scanpy>=1.9.0->marcopolo-pytorch) (2.6.3)\n","Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from scanpy>=1.9.0->marcopolo-pytorch) (1.1.0)\n","Collecting umap-learn>=0.3.10\n"," Downloading umap-learn-0.5.3.tar.gz (88 kB)\n","\u001b[K |████████████████████████████████| 88 kB 8.5 MB/s \n","\u001b[?25hRequirement already satisfied: numba>=0.41.0 in /usr/local/lib/python3.7/dist-packages (from scanpy>=1.9.0->marcopolo-pytorch) (0.51.2)\n","Requirement already satisfied: patsy in /usr/local/lib/python3.7/dist-packages (from scanpy>=1.9.0->marcopolo-pytorch) (0.5.2)\n","Collecting session-info\n"," Downloading session_info-1.0.0.tar.gz (24 kB)\n","Requirement already satisfied: llvmlite<0.35,>=0.34.0.dev0 in /usr/local/lib/python3.7/dist-packages (from numba>=0.41.0->scanpy>=1.9.0->marcopolo-pytorch) (0.34.0)\n","Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.24.1->marcopolo-pytorch) (3.1.0)\n","Collecting pynndescent>=0.5\n"," Downloading pynndescent-0.5.7.tar.gz (1.1 MB)\n","\u001b[K |████████████████████████████████| 1.1 MB 66.2 MB/s \n","\u001b[?25hRequirement already satisfied: notebook>=4.4.1 in /usr/local/lib/python3.7/dist-packages (from widgetsnbextension~=3.6.0->ipywidgets>=7.5.1->marcopolo-pytorch) (5.3.1)\n","Requirement already satisfied: Send2Trash in /usr/local/lib/python3.7/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.5.1->marcopolo-pytorch) (1.8.0)\n","Requirement already satisfied: nbconvert in /usr/local/lib/python3.7/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.5.1->marcopolo-pytorch) (5.6.1)\n","Requirement already satisfied: terminado>=0.8.1 in /usr/local/lib/python3.7/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.5.1->marcopolo-pytorch) (0.13.3)\n","Requirement already satisfied: pyzmq>=13 in /usr/local/lib/python3.7/dist-packages (from jupyter-client->ipykernel>=4.5.1->ipywidgets>=7.5.1->marcopolo-pytorch) (23.1.0)\n","Requirement already satisfied: ptyprocess in /usr/local/lib/python3.7/dist-packages (from terminado>=0.8.1->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.5.1->marcopolo-pytorch) (0.7.0)\n","Requirement already satisfied: defusedxml in /usr/local/lib/python3.7/dist-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.5.1->marcopolo-pytorch) (0.7.1)\n","Requirement already satisfied: entrypoints>=0.2.2 in /usr/local/lib/python3.7/dist-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.5.1->marcopolo-pytorch) (0.4)\n","Requirement already satisfied: testpath in /usr/local/lib/python3.7/dist-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.5.1->marcopolo-pytorch) (0.6.0)\n","Requirement already satisfied: mistune<2,>=0.8.1 in /usr/local/lib/python3.7/dist-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.5.1->marcopolo-pytorch) (0.8.4)\n","Requirement already satisfied: bleach in /usr/local/lib/python3.7/dist-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.5.1->marcopolo-pytorch) 
(5.0.0)\n","Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.7/dist-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.5.1->marcopolo-pytorch) (1.5.0)\n","Requirement already satisfied: webencodings in /usr/local/lib/python3.7/dist-packages (from bleach->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.5.1->marcopolo-pytorch) (0.5.1)\n","Collecting stdlib_list\n"," Downloading stdlib_list-0.8.0-py3-none-any.whl (63 kB)\n","\u001b[K |████████████████████████████████| 63 kB 2.1 MB/s \n","\u001b[?25hBuilding wheels for collected packages: umap-learn, pynndescent, session-info\n"," Building wheel for umap-learn (setup.py) ... \u001b[?25l\u001b[?25hdone\n"," Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82829 sha256=b10f4a5c1d8ec711514f006c15e032a1e0011c009ac3afc1c835508c3d694587\n"," Stored in directory: /root/.cache/pip/wheels/b3/52/a5/1fd9e3e76a7ab34f134c07469cd6f16e27ef3a37aeff1fe821\n"," Building wheel for pynndescent (setup.py) ... \u001b[?25l\u001b[?25hdone\n"," Created wheel for pynndescent: filename=pynndescent-0.5.7-py3-none-any.whl size=54286 sha256=6407072a8cfb47bb9317310be8bdc5eb6acae320e8504a7ced57ef980e3a98df\n"," Stored in directory: /root/.cache/pip/wheels/7f/2a/f8/7bd5dcec71bd5c669f6f574db3113513696b98f3f9b51f496c\n"," Building wheel for session-info (setup.py) ... \u001b[?25l\u001b[?25hdone\n"," Created wheel for session-info: filename=session_info-1.0.0-py3-none-any.whl size=8048 sha256=652dcb3f1a0e01288e86c673394a7df148a8f9b8dfadbba4963ddca697ebcc7b\n"," Stored in directory: /root/.cache/pip/wheels/bd/ad/14/6a42359351a18337a8683854cfbba99dd782271f2d1767f87f\n","Successfully built umap-learn pynndescent session-info\n","Installing collected packages: scipy, fonttools, stdlib-list, pynndescent, matplotlib, umap-learn, session-info, anndata, scanpy, einops, marcopolo-pytorch\n"," Attempting uninstall: scipy\n"," Found existing installation: scipy 1.4.1\n"," Uninstalling scipy-1.4.1:\n"," Successfully uninstalled scipy-1.4.1\n"," Attempting uninstall: matplotlib\n"," Found existing installation: matplotlib 3.2.2\n"," Uninstalling matplotlib-3.2.2:\n"," Successfully uninstalled matplotlib-3.2.2\n","\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n","albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.\u001b[0m\n","Successfully installed anndata-0.8.0 einops-0.4.1 fonttools-4.33.3 marcopolo-pytorch-1.0.6 matplotlib-3.5.2 pynndescent-0.5.7 scanpy-1.9.1 scipy-1.7.3 session-info-1.0.0 stdlib-list-0.8.0 umap-learn-0.5.3\n"]},{"output_type":"display_data","data":{"application/vnd.colab-display-data+json":{"pip_warning":{"packages":["matplotlib","mpl_toolkits"]}}},"metadata":{}}],"source":["!pip install marcopolo-pytorch --upgrade"]},{"cell_type":"code","source":["!pip install matplotlib==3.1.3"],"metadata":{"id":"S0SRQCljZY3K"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"y3BkhfFoRwgn"},"source":["# Run MarcoPolo"]},{"cell_type":"markdown","metadata":{"id":"kXTmowerRwgo"},"source":["## Import packages"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"MzCccDm6Rwgo"},"outputs":[],"source":["# Import packages\n","import pickle\n","\n","import numpy as np\n","import pandas as pd\n","import torch\n","import anndata as ad\n","import scanpy as sc\n","import matplotlib.pyplot as plt\n","\n","import MarcoPolo\n","\n","assert torch.cuda.is_available(), \"Make sure that you started the colab kernel with GPU: Runtime -> Change runtime type -> GPU\""]},{"cell_type":"markdown","metadata":{"id":"bKbzeNbIRwgp"},"source":["## Read scRNA-seq data"]},{"cell_type":"markdown","metadata":{"id":"glaoagQiRwgp"},"source":["You can use **example data** or **your own data**.\n","\n","It should be in a AnnData format. `.X` should contain a raw count matrix of shape (# cells, # genes). You can explore example datasets below"]},{"cell_type":"markdown","metadata":{"id":"RERjYa7gRwgp"},"source":["### example data\n","We have prepared two example data: the human embryogenic stem cell (hESC) dataset of Koh et al. and the liver dataset of MacParland et al. "]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"dkuBS5jTRwgp","executionInfo":{"status":"ok","timestamp":1655449978043,"user_tz":420,"elapsed":2150,"user":{"displayName":"","userId":""}},"outputId":"0c460924-908b-42ce-bcb2-544ef67594af"},"outputs":[{"output_type":"stream","name":"stdout","text":["--2022-06-17 07:12:56-- https://raw.githubusercontent.com/chanwkimlab/MarcoPolo/main/notebooks/example/hESC.h5ad\n","Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n","Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n","HTTP request sent, awaiting response... 200 OK\n","Length: 20842419 (20M) [application/octet-stream]\n","Saving to: ‘hESC.h5ad.2’\n","\n","hESC.h5ad.2 100%[===================>] 19.88M 74.1MB/s in 0.3s \n","\n","2022-06-17 07:12:56 (74.1 MB/s) - ‘hESC.h5ad.2’ saved [20842419/20842419]\n","\n","--2022-06-17 07:12:56-- https://raw.githubusercontent.com/chanwkimlab/MarcoPolo/main/notebooks/example/HumanLiver.h5ad\n","Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n","Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n","HTTP request sent, awaiting response... 
200 OK\n","Length: 15403217 (15M) [application/octet-stream]\n","Saving to: ‘HumanLiver.h5ad.2’\n","\n","HumanLiver.h5ad.2 100%[===================>] 14.69M 85.1MB/s in 0.2s \n","\n","2022-06-17 07:12:56 (85.1 MB/s) - ‘HumanLiver.h5ad.2’ saved [15403217/15403217]\n","\n"]}],"source":["!wget https://raw.githubusercontent.com/chanwkimlab/MarcoPolo/main/notebooks/example/hESC.h5ad\n","!wget https://raw.githubusercontent.com/chanwkimlab/MarcoPolo/main/notebooks/example/HumanLiver.h5ad\n"," \n","anndata_path = \"hESC.h5ad\"\n","\n","# Read anndata. `anndata_path` should be in a `h5ad` format.\n","adata = ad.read(anndata_path)\n","\n","# For fast debugging, only test first 1,000 genes.\n","adata = adata[:, :1000]"]},{"cell_type":"markdown","metadata":{"id":"l-pjW1E-Rwgq"},"source":["### your own data\n","You can upload your own AnnData single cell file to this session. If you intend to use the example data, please run the following cell and upload your data."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"W4xkBOEBRwgq"},"outputs":[],"source":["from google.colab import files\n","uploaded = files.upload()\n","\n","for file_name in uploaded.keys():\n"," print('User uploaded file \"{name}\" with length {length} bytes'.format(name=file_name, length=len(uploaded[file_name])))\n"," \n","anndata_path = file_name\n","\n","# Read anndata. `anndata_path` should be in a `h5ad` format.\n","adata = ad.read(anndata_path)"]},{"cell_type":"markdown","metadata":{"id":"OqbddBkJRwgq"},"source":["## (1) Run regression"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"MTRtSieQRwgq","executionInfo":{"status":"ok","timestamp":1655450528625,"user_tz":420,"elapsed":545195,"user":{"displayName":"","userId":""}},"outputId":"c1ef0be9-41ac-4a2c-d360-08dcbdb2e53f"},"outputs":[{"output_type":"stream","name":"stdout","text":["The numbers of clusters to test: [1, 2]\n","Y: (446, 1000) X: (446, 1) s: (446,)\n","(1) Fitting with 1 cluster(s)\n"]},{"output_type":"stream","name":"stderr","text":["\rProgress: 0%| | 0/1000 [00:001 does not seem to work well on colab (maybe due to the the limited RAM).\n","\n","with open(f\"{anndata_path}.regression_result.pickle\", \"wb\") as f:\n"," pickle.dump(regression_result, f)"]},{"cell_type":"markdown","metadata":{"id":"u7wMfLjdRwgq"},"source":["## (2) Find markers"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"8xRU-14nRwgr","executionInfo":{"status":"ok","timestamp":1655447888830,"user_tz":420,"elapsed":29627,"user":{"displayName":"","userId":""}},"outputId":"6ce24c40-af88-4f42-a9d8-d08da68c35c7"},"outputs":[{"output_type":"stream","name":"stdout","text":["Assign cells to on-cells and off-cells...\n","Calculating voting score...\n","Calculating proximity score...\n","Calculating bimodality score...\n","Calculating MarcoPolo score...\n"]}],"source":["# (2) Find markers\n","markers_result = MarcoPolo.find_markers(adata=adata, regression_result=regression_result)\n","with open(f\"{anndata_path}.markers_result.pickle\", \"wb\") as f:\n"," pickle.dump(markers_result, f)"]},{"cell_type":"markdown","metadata":{"id":"khw3_4pdRwgr"},"source":["## (3) Generate report"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"wbCJrulnRwgr"},"outputs":[],"source":["# Obtain tSNE coordinates if it does not exist in the adata.\n","if \"X_tsne\" not in adata.obsm.keys():\n"," 
sc.tl.tsne(adata=adata)"]},{"cell_type":"code","execution_count":null,"metadata":{"scrolled":false,"colab":{"base_uri":"https://localhost:8080/"},"id":"CTtTnn0XRwgr","executionInfo":{"status":"ok","timestamp":1655448920074,"user_tz":420,"elapsed":814824,"user":{"displayName":"","userId":""}},"outputId":"0b6a4a3a-3439-492c-f0b0-ec81da77a983"},"outputs":[{"output_type":"stream","name":"stdout","text":["Assign cells to on-cells and off-cells...\n","Annotating genes with the gene info: https://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz\n"]},{"output_type":"stream","name":"stderr","text":["100%|██████████| 1000/1000 [00:22<00:00, 45.07it/s, Num. of unmatched genes=13]\n"]},{"output_type":"stream","name":"stdout","text":["13 not matched genes: MT-CO1, MT-CO3, MT-ND4, MT-ATP6, MT-ND3, MT-CYB, MT-ND2, MT-ND1, MT-ND5, MT-ND4L, MT-ND6, AC090498.1, MT-ATP8\n","Generating table files...\n","Generating image files...\n","Drawing figures\n","size factor corrected\n"]},{"output_type":"stream","name":"stderr","text":["\r 0%| | 0/1000 [00:00"],"application/javascript":["\n"," async function download(id, filename, size) {\n"," if (!google.colab.kernel.accessAllowed) {\n"," return;\n"," }\n"," const div = document.createElement('div');\n"," const label = document.createElement('label');\n"," label.textContent = `Downloading \"${filename}\": `;\n"," div.appendChild(label);\n"," const progress = document.createElement('progress');\n"," progress.max = size;\n"," div.appendChild(progress);\n"," document.body.appendChild(div);\n","\n"," const buffers = [];\n"," let downloaded = 0;\n","\n"," const channel = await google.colab.kernel.comms.open(id);\n"," // Send a message to notify the kernel that we're ready.\n"," channel.send({})\n","\n"," for await (const message of channel.messages) {\n"," // Send a message to notify the kernel that we're ready.\n"," channel.send({})\n"," if (message.buffers) {\n"," for (const buffer of message.buffers) {\n"," buffers.push(buffer);\n"," downloaded += buffer.byteLength;\n"," progress.value = downloaded;\n"," }\n"," }\n"," }\n"," const blob = new Blob(buffers, {type: 'application/binary'});\n"," const a = document.createElement('a');\n"," a.href = window.URL.createObjectURL(blob);\n"," a.download = filename;\n"," div.appendChild(a);\n"," a.click();\n"," div.remove();\n"," }\n"," "]},"metadata":{}},{"output_type":"display_data","data":{"text/plain":[""],"application/javascript":["download(\"download_440e2e6b-2d24-469b-ba33-d4c4cc31e2c8\", \"report.tar.gz\", 73729602)"]},"metadata":{}}],"source":["from google.colab import files\n","\n","files.download('report.tar.gz')"]},{"cell_type":"code","source":[""],"metadata":{"id":"A8f_wjK6gGGg"},"execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"display_name":"marcopolo","language":"python","name":"marcopolo"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.13"},"colab":{"name":"MarcoPolo.ipynb","provenance":[{"file_id":"https://github.com/chanwkimlab/MarcoPolo/blob/main/notebooks/MarcoPolo.ipynb","timestamp":1655489182610}],"toc_visible":true,"collapsed_sections":[]},"accelerator":"GPU","gpuClass":"standard"},"nbformat":4,"nbformat_minor":0} -------------------------------------------------------------------------------- /notebooks/example/HumanLiver.h5ad: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chanwkimlab/MarcoPolo/3c0f3300a2e1c264a1697e7e8325f29db597b516/notebooks/example/HumanLiver.h5ad
--------------------------------------------------------------------------------
/notebooks/example/hESC.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chanwkimlab/MarcoPolo/3c0f3300a2e1c264a1697e7e8325f29db597b516/notebooks/example/hESC.h5ad
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import find_packages, setup

# Build and publish the package:
#   python -m build
#   python -m twine upload --repository pypi dist/* --verbose

setup(
    name="marcopolo-pytorch",
    packages=find_packages(exclude=[]),
    include_package_data=True,
    version="1.0.9",
    description="MarcoPolo - PyTorch",
    author="Chanwoo Kim",
    # The README ships with the sdist (see MANIFEST.in) and renders as the PyPI description.
    long_description=open("README.md", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
    url="https://github.com/chanwkimlab/MarcoPolo",
    keywords=["single-cell", "bioinformatics", "pytorch"],
    install_requires=[
        "tqdm",
        "einops>=0.3",
        "numpy>=1.19.2",
        "torch>=1.4.0",
        "pandas>=1.2.0",
        "scikit-learn>=0.24.1",
        "scipy>=1.6.1",
        "matplotlib>=3.3.0",
        "seaborn>=0.11.1",
        "Jinja2>=2.11.2",
        "anndata>=0.7.4",
        "rpy2>=3.4.2",
        "ipywidgets>=7.5.1",
        "scanpy>=1.9.0",
    ],
    classifiers=[
        "Development Status :: 4 - Beta",
        "Intended Audience :: Developers",
        "Topic :: Scientific/Engineering :: Bio-Informatics",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "License :: Other/Proprietary License",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
    ],
)
--------------------------------------------------------------------------------
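As a usage note, the packaging configuration above also supports installing MarcoPolo from a local clone, which is convenient for development. The commands below are a minimal sketch and assume a standard `pip`/`build` toolchain is available:

```bash
git clone https://github.com/chanwkimlab/MarcoPolo.git
cd MarcoPolo

# Editable install for development.
pip install -e .

# Build the sdist/wheel artifacts referenced in the comments at the top of setup.py.
python -m build
```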