├── scghost_overview.png ├── utilities ├── gpu.py ├── chrom_sizes.py ├── helper.py └── parsers.py ├── LICENSE ├── data ├── mm10.chrom.sizes ├── hg19.chrom.sizes └── hg38.chrom.sizes ├── modules ├── postprocessing.py ├── random_walk.py ├── preprocessing.py ├── embedding.py ├── analysis.py └── clustering.py ├── .gitignore ├── sample_configs ├── config_wtc.json ├── config_GM12878.json └── config_pfc.json ├── config.json ├── requirements.txt ├── scghost.py ├── README.md └── tutorial.ipynb /scghost_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ma-compbio/scGHOST/HEAD/scghost_overview.png -------------------------------------------------------------------------------- /utilities/gpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def to_cuda(x): 4 | 5 | if torch.cuda.is_available(): 6 | return x.cuda() 7 | 8 | return x -------------------------------------------------------------------------------- /utilities/chrom_sizes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def chrom_sizes(f,length=np.inf): 4 | data = open(f,'r') 5 | 6 | sizes = {} 7 | 8 | for line in data: 9 | ldata = line.split() 10 | 11 | if len(ldata[0]) > length: 12 | continue 13 | 14 | sizes[ldata[0]] = int(ldata[1]) 15 | 16 | return sizes -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Ma Lab at CMU 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /data/mm10.chrom.sizes: -------------------------------------------------------------------------------- 1 | chr1 195471971 2 | chr2 182113224 3 | chr3 160039680 4 | chr4 156508116 5 | chr5 151834684 6 | chr6 149736546 7 | chr7 145441459 8 | chr8 129401213 9 | chr9 124595110 10 | chr10 130694993 11 | chr11 122082543 12 | chr12 120129022 13 | chr13 120421639 14 | chr14 124902244 15 | chr15 104043685 16 | chr16 98207768 17 | chr17 94987271 18 | chr18 90702639 19 | chr19 61431566 20 | chrX 171031299 21 | chrY 91744698 22 | chrM 16299 23 | chr1_GL456210_random 169725 24 | chr1_GL456211_random 241735 25 | chr1_GL456212_random 153618 26 | chr1_GL456213_random 39340 27 | chr1_GL456221_random 206961 28 | chr4_GL456216_random 66673 29 | chr4_GL456350_random 227966 30 | chr4_JH584292_random 14945 31 | chr4_JH584293_random 207968 32 | chr4_JH584294_random 191905 33 | chr4_JH584295_random 1976 34 | chr5_GL456354_random 195993 35 | chr5_JH584296_random 199368 36 | chr5_JH584297_random 205776 37 | chr5_JH584298_random 184189 38 | chr5_JH584299_random 953012 39 | chr7_GL456219_random 175968 40 | chrUn_GL456239 40056 41 | chrUn_GL456359 22974 42 | chrUn_GL456360 31704 43 | chrUn_GL456366 47073 44 | chrUn_GL456367 42057 45 | chrUn_GL456368 20208 46 | chrUn_GL456370 26764 47 | chrUn_GL456372 28664 48 | chrUn_GL456378 31602 49 | chrUn_GL456379 72385 50 | chrUn_GL456381 25871 51 | chrUn_GL456382 23158 52 | chrUn_GL456383 38659 53 | chrUn_GL456385 35240 54 | chrUn_GL456387 24685 55 | chrUn_GL456389 28772 56 | chrUn_GL456390 24668 57 | chrUn_GL456392 23629 58 | chrUn_GL456393 55711 59 | chrUn_GL456394 24323 60 | chrUn_GL456396 21240 61 | chrUn_JH584304 114452 62 | chrX_GL456233_random 336933 63 | chrY_JH584300_random 182347 64 | chrY_JH584301_random 259875 65 | chrY_JH584302_random 155838 66 | chrY_JH584303_random 158099 -------------------------------------------------------------------------------- /data/hg19.chrom.sizes: -------------------------------------------------------------------------------- 1 | chr1 249250621 2 | chr2 243199373 3 | chr3 198022430 4 | chr4 191154276 5 | chr5 180915260 6 | chr6 171115067 7 | chr7 159138663 8 | chrX 155270560 9 | chr8 146364022 10 | chr9 141213431 11 | chr10 135534747 12 | chr11 135006516 13 | chr12 133851895 14 | chr13 115169878 15 | chr14 107349540 16 | chr15 102531392 17 | chr16 90354753 18 | chr17 81195210 19 | chr18 78077248 20 | chr20 63025520 21 | chrY 59373566 22 | chr19 59128983 23 | chr22 51304566 24 | chr21 48129895 25 | chr6_ssto_hap7 4928567 26 | chr6_mcf_hap5 4833398 27 | chr6_cox_hap2 4795371 28 | chr6_mann_hap4 4683263 29 | chr6_apd_hap1 4622290 30 | chr6_qbl_hap6 4611984 31 | chr6_dbb_hap3 4610396 32 | chr17_ctg5_hap1 1680828 33 | chr4_ctg9_hap1 590426 34 | chr1_gl000192_random 547496 35 | chrUn_gl000225 211173 36 | chr4_gl000194_random 191469 37 | chr4_gl000193_random 189789 38 | chr9_gl000200_random 187035 39 | chrUn_gl000222 186861 40 | chrUn_gl000212 186858 41 | chr7_gl000195_random 182896 42 | chrUn_gl000223 180455 43 | chrUn_gl000224 179693 44 | chrUn_gl000219 179198 45 | chr17_gl000205_random 174588 46 | chrUn_gl000215 172545 47 | chrUn_gl000216 172294 48 | chrUn_gl000217 172149 49 | chr9_gl000199_random 169874 50 | chrUn_gl000211 166566 51 | chrUn_gl000213 164239 52 | chrUn_gl000220 161802 53 | chrUn_gl000218 161147 54 | chr19_gl000209_random 159169 55 | chrUn_gl000221 155397 56 | chrUn_gl000214 137718 57 | chrUn_gl000228 129120 58 | chrUn_gl000227 128374 59 | chr1_gl000191_random 106433 60 | chr19_gl000208_random 92689 61 | chr9_gl000198_random 90085 62 | chr17_gl000204_random 81310 63 | chrUn_gl000233 45941 64 | chrUn_gl000237 45867 65 | chrUn_gl000230 43691 66 | chrUn_gl000242 43523 67 | chrUn_gl000243 43341 68 | chrUn_gl000241 42152 69 | chrUn_gl000236 41934 70 | chrUn_gl000240 41933 71 | chr17_gl000206_random 41001 72 | chrUn_gl000232 40652 73 | chrUn_gl000234 40531 74 | chr11_gl000202_random 40103 75 | chrUn_gl000238 39939 76 | chrUn_gl000244 39929 77 | chrUn_gl000248 39786 78 | chr8_gl000196_random 38914 79 | chrUn_gl000249 38502 80 | chrUn_gl000246 38154 81 | chr17_gl000203_random 37498 82 | chr8_gl000197_random 37175 83 | chrUn_gl000245 36651 84 | chrUn_gl000247 36422 85 | chr9_gl000201_random 36148 86 | chrUn_gl000235 34474 87 | chrUn_gl000239 33824 88 | chr21_gl000210_random 27682 89 | chrUn_gl000231 27386 90 | chrUn_gl000229 19913 91 | chrM 16571 92 | chrUn_gl000226 15008 93 | chr18_gl000207_random 4262 94 | -------------------------------------------------------------------------------- /modules/postprocessing.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import numpy as np 4 | import torch 5 | # calibration 6 | from tqdm.auto import trange 7 | 8 | # modified function to return calibrated pairs/labels 9 | def post_process_samples(sample_data,OEMs,nearest_neighbors,neighbor_contacts=True): 10 | 11 | all_cell_chrom_samples = sample_data 12 | 13 | pos_negs = [1,-1] 14 | 15 | all_continuous_samples = [] 16 | 17 | num_cells = len(OEMs) 18 | 19 | for n in trange(num_cells, desc='post processing'): 20 | 21 | continuous_intra_samples = [] 22 | all_cell_labels = all_cell_chrom_samples[n] 23 | 24 | for pn in pos_negs: 25 | 26 | pos_neg_intra_idx = np.where((all_cell_labels == pn))[0] 27 | chrm_samples = all_cell_chrom_samples[n][pos_neg_intra_idx] 28 | 29 | adjusted_chrm_samples = np.zeros((len(chrm_samples),3)) 30 | chrm_samples = chrm_samples[:,:2].astype(int)#.cpu().numpy().astype(int) 31 | 32 | chrlen = len(OEMs[n]) 33 | 34 | chrm_linear_samples = chrm_samples[:,0] * chrlen + chrm_samples[:,1] 35 | 36 | ms_flattened = OEMs[nearest_neighbors[n]].reshape(nearest_neighbors.shape[1],-1) 37 | for i in range(ms_flattened.shape[0]): 38 | ms_flattened[i][ms_flattened[i] > 0] = ms_flattened[i][ms_flattened[i] > 0] / np.quantile(ms_flattened[i][ms_flattened[i] > 0],0.975) 39 | 40 | if torch.sum(ms_flattened[i] <= 0) > 0: 41 | ms_flattened[i][ms_flattened[i] <= 0] = -(ms_flattened[i][ms_flattened[i] <= 0] / np.quantile(ms_flattened[i][ms_flattened[i] <= 0],0.025)) 42 | 43 | ms_flattened[i][ms_flattened[i] > 1] = 1 44 | ms_flattened[i][ms_flattened[i] < -1] = -1 45 | 46 | c_flattened = ms_flattened[:,chrm_linear_samples].mean(dim=0) if neighbor_contacts else ms_flattened[0,chrm_linear_samples] 47 | adjusted_chrm_samples[:,:2] = chrm_samples 48 | adjusted_chrm_samples[:,2] = c_flattened 49 | 50 | adjusted_del_idx = np.where(np.sign(c_flattened) != pn)[0] 51 | adjusted_chrm_samples = np.delete(adjusted_chrm_samples,adjusted_del_idx,axis=0) 52 | 53 | continuous_intra_samples.append(adjusted_chrm_samples) 54 | 55 | continuous_intra_samples = torch.tensor(np.concatenate(continuous_intra_samples)).float() 56 | all_continuous_samples.append(continuous_intra_samples) 57 | 58 | all_continuous_pairs = [all_continuous_samples[i][:,:2].long() for i in range(len(all_continuous_samples))] 59 | all_continuous_labels = [all_continuous_samples[i][:,2] for i in range(len(all_continuous_samples))] 60 | 61 | return all_continuous_pairs, all_continuous_labels -------------------------------------------------------------------------------- /utilities/helper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | def to_cuda(x): 5 | 6 | if torch.cuda.is_available(): 7 | return x.cuda() 8 | 9 | return x 10 | 11 | def get_expected(M,eps=1e-8): 12 | E = np.zeros_like(M) 13 | l = len(M) 14 | 15 | for i in range(M.shape[0]): 16 | contacts = np.diag(M,i) 17 | expected = contacts.sum() / (l-i) 18 | # expected = np.median(contacts) 19 | x_diag,y_diag = np.diag_indices(M.shape[0]-i) 20 | x,y = x_diag,y_diag+i 21 | E[x,y] = expected 22 | 23 | E += E.T 24 | E = np.nan_to_num(E) + eps 25 | 26 | return E 27 | 28 | def get_oe_matrix(M): 29 | E = get_expected(M) 30 | oe = np.nan_to_num(M / E) 31 | np.fill_diagonal(oe,1) 32 | 33 | return oe 34 | 35 | def random_sample(p, size, neg=False, normed=False): 36 | 37 | if not normed: 38 | p_ = p / torch.sum(p, dim=-1, keepdim=True) 39 | else: 40 | p_ = p 41 | 42 | # rg = np.random.default_rng() 43 | 44 | random_num = torch.rand(p_.shape,device=p.device) 45 | # random_num /= torch.sum(random_num, dim=-1, keepdim=True) 46 | 47 | diff = random_num - p_ 48 | 49 | # k = size 50 | sampled_weights,sampled_idx = torch.topk(diff, size, dim=-1, largest=neg) 51 | sampled_weights = sampled_weights[..., :size] if not neg else sampled_weights[..., -size:] 52 | sampled_idx = sampled_idx[..., :size] if not neg else sampled_idx[..., -size:] 53 | 54 | return sampled_weights,sampled_idx 55 | 56 | def random_sample_sorted(p, size, top=None, neg=False, normed=False): 57 | 58 | if top is not None: 59 | p = p[...,-top:] if not neg else p[...,:top] 60 | 61 | 62 | if not normed: 63 | p_ = p / torch.sum(p, dim=-1, keepdim=True) 64 | else: 65 | p_ = p 66 | 67 | # rg = np.random.default_rng() 68 | 69 | random_num = torch.rand(p_.shape,device=p.device) 70 | # random_num /= torch.sum(random_num, dim=-1, keepdim=True) 71 | 72 | diff = random_num - p_ 73 | 74 | # k = size 75 | sampled_weights,sampled_idx = torch.topk(diff, size, dim=-1, largest=neg) 76 | sampled_weights = sampled_weights[..., :size]# if not neg else sampled_weights[..., -size:] 77 | sampled_idx = sampled_idx[..., :size]# if not neg else sampled_idx[..., -size:] 78 | 79 | return sampled_weights,sampled_idx 80 | 81 | def random_sample_np(p, size, normed=False): 82 | if not normed: 83 | p_ = p / np.sum(p, axis=-1, keepdims=True) 84 | else: 85 | p_ = p 86 | rg = np.random.default_rng() 87 | random_num = rg.random(p_.shape) 88 | random_num /= np.sum(random_num, axis=1, keepdims=True) 89 | diff = random_num - p_ 90 | 91 | # k = size 92 | sampled_idx = np.argpartition(diff, size, axis=1)[..., :size] 93 | return sampled_idx -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 105 | __pypackages__/ 106 | 107 | # Celery stuff 108 | celerybeat-schedule 109 | celerybeat.pid 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | .env 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | .dmypy.json 136 | dmypy.json 137 | 138 | # Pyre type checker 139 | .pyre/ 140 | 141 | # pytype static type analyzer 142 | .pytype/ 143 | 144 | # Cython debug symbols 145 | cython_debug/ 146 | 147 | # PyCharm 148 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 149 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 150 | # and can be added to the global gitignore or merged into this file. For a more nuclear 151 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 152 | #.idea/ 153 | .DS_Store 154 | .idea/inspectionProfiles/profiles_settings.xml 155 | .idea/inspectionProfiles/Project_Default.xml 156 | .idea/modules.xml 157 | .idea/scghost-dev.iml 158 | .idea/vcs.xml 159 | 160 | *.hdf5 161 | *.npy 162 | *.pdf 163 | tutorial.json 164 | bed_files/ -------------------------------------------------------------------------------- /modules/random_walk.py: -------------------------------------------------------------------------------- 1 | # parallelized constrained random walks on the GPU with random walks to inter corr matrix 2 | import os 3 | import torch 4 | import pickle 5 | import gc 6 | import numpy as np 7 | 8 | from utilities.gpu import to_cuda 9 | from utilities.parsers import parse_nearest_neighbors 10 | from utilities.helper import random_sample 11 | from modules.preprocessing import parse_chromosomes 12 | from torch.nn import functional as F 13 | from tqdm import trange 14 | 15 | def sample_chrom(chrom_num,OEMs,cell_range,nearest_neighbors,num_walks=50,use_breakpoint=False): 16 | 17 | all_cell_chrom_samples = [] 18 | layered_maps = OEMs[nearest_neighbors[cell_range]] 19 | 20 | for cnum in trange(len(cell_range)): 21 | chrm_offset = 0 22 | 23 | # m = to_cuda(torch.tensor(torch.nan_to_num(layered_maps[cnum])).float()) 24 | m = to_cuda(layered_maps[cnum].float()) # cast from bfloat16 to float32 for precision with sorting 25 | 26 | bpt = 0 27 | 28 | if use_breakpoint: 29 | bpt = len(m) // 2 30 | m = m[:bpt,bpt:] 31 | 32 | all_samples = [] 33 | 34 | num_top = int(m.shape[1] * 0.25) 35 | 36 | sorted_slc_w = torch.zeros_like(m) 37 | sorted_slc_i = torch.zeros_like(m) 38 | sorted_slc_w_T = torch.zeros_like(m) 39 | sorted_slc_i_T = torch.zeros_like(m) 40 | 41 | for i in range(m.shape[0]): 42 | sorted_slc_w[i],sorted_slc_i[i] = m[i].sort(dim=1) 43 | sorted_slc_w_T[i],sorted_slc_i_T[i] = m[i].T.sort(dim=1) 44 | 45 | sorted_slc_w = sorted_slc_w.repeat(num_walks,1,1,1) 46 | sorted_slc_w_T = sorted_slc_w_T.repeat(num_walks,1,1,1) 47 | sorted_slc_i = sorted_slc_i.repeat(num_walks,1,1,1) 48 | sorted_slc_i_T = sorted_slc_i_T.repeat(num_walks,1,1,1) 49 | 50 | test_samples = to_cuda(torch.arange(m.shape[1])) 51 | 52 | w1,i1 = sorted_slc_w[:,:,test_samples],sorted_slc_i[:,:,test_samples] 53 | 54 | pw1 = torch.exp(w1[...,-num_top:]) 55 | nw1 = 1/torch.exp(w1[...,:num_top]) # inverse to select for lower contact frequencies 56 | 57 | pi1 = i1[...,-num_top:] 58 | ni1 = i1[...,:num_top] 59 | 60 | p_mask = F.one_hot(torch.squeeze(random_sample(pw1,1)[1]),num_classes=pi1.shape[-1]) 61 | n_mask = F.one_hot(torch.squeeze(random_sample(nw1,1)[1]),num_classes=ni1.shape[-1]) 62 | 63 | pos_selection1 = ((pi1 * p_mask).sum(dim=-1)).long() 64 | neg_selection1 = ((ni1 * n_mask).sum(dim=-1)).long() 65 | 66 | pw2 = torch.gather(sorted_slc_w_T,-2,pos_selection1[...,None].tile(1,1,1,pos_selection1.shape[-1])) 67 | pi2 = torch.gather(sorted_slc_i_T,-2,pos_selection1[...,None].tile(1,1,1,pos_selection1.shape[-1])) 68 | 69 | pw2 = torch.exp(pw2[...,-num_top:]) 70 | pi2 = pi2[...,-num_top:] 71 | 72 | nw2 = torch.gather(sorted_slc_w_T,-2,neg_selection1[...,None].tile(1,1,1,pos_selection1.shape[-1])) 73 | ni2 = torch.gather(sorted_slc_i_T,-2,neg_selection1[...,None].tile(1,1,1,pos_selection1.shape[-1])) 74 | 75 | nw2 = torch.exp(nw2[...,-num_top:]) 76 | ni2 = ni2[...,-num_top:] 77 | 78 | p_mask = F.one_hot(torch.squeeze(random_sample(pw2,1)[1]),num_classes=pi2.shape[-1]) 79 | n_mask = F.one_hot(torch.squeeze(random_sample(nw2,1)[1]),num_classes=ni2.shape[-1]) 80 | 81 | pos_selection2 = ((pi2 * p_mask).sum(dim=-1)).long() 82 | neg_selection2 = ((ni2 * n_mask).sum(dim=-1)).long() 83 | 84 | for i in range(num_walks): 85 | selections = to_cuda(torch.stack(( 86 | pos_selection1[i].flatten() + bpt, 87 | pos_selection2[i].flatten(), 88 | neg_selection1[i].flatten() + bpt, 89 | neg_selection2[i].flatten() 90 | )).T.flatten()) 91 | 92 | labels = to_cuda(torch.tensor([1,1,-1,-1]).repeat(len(m) * len(test_samples))) 93 | 94 | interactions = torch.stack(( 95 | test_samples.repeat_interleave(4 * len(m)) + chrm_offset, 96 | selections, 97 | labels, 98 | )).T 99 | 100 | all_samples.append(interactions) 101 | 102 | all_samples = torch.unique(torch.cat(all_samples),dim=0).cpu().numpy().astype(np.int16) 103 | all_cell_chrom_samples.append(all_samples) 104 | 105 | return all_cell_chrom_samples -------------------------------------------------------------------------------- /sample_configs/config_wtc.json: -------------------------------------------------------------------------------- 1 | { 2 | "schic_directory" : "/mnt/e/data/wtc", 3 | "label_info": null, 4 | "data_directory" : "/mnt/e/data/scghost_wtc_output", 5 | "chromosomes" : { 6 | "1" : { 7 | "adj" : "chr1_sparse_adj.npy", 8 | "imputed" : "chr1_exp1_nbr_5_impute.hdf5", 9 | "integer" : 1 10 | }, 11 | "2" : { 12 | "adj" : "chr2_sparse_adj.npy", 13 | "imputed" : "chr2_exp1_nbr_5_impute.hdf5", 14 | "integer" : 2 15 | }, 16 | "3" : { 17 | "adj" : "chr3_sparse_adj.npy", 18 | "imputed" : "chr3_exp1_nbr_5_impute.hdf5", 19 | "integer" : 3 20 | }, 21 | "4" : { 22 | "adj" : "chr4_sparse_adj.npy", 23 | "imputed" : "chr4_exp1_nbr_5_impute.hdf5", 24 | "integer" : 4 25 | }, 26 | "5" : { 27 | "adj" : "chr5_sparse_adj.npy", 28 | "imputed" : "chr5_exp1_nbr_5_impute.hdf5", 29 | "integer" : 5 30 | }, 31 | "6" : { 32 | "adj" : "chr6_sparse_adj.npy", 33 | "imputed" : "chr6_exp1_nbr_5_impute.hdf5", 34 | "integer" : 6 35 | }, 36 | "7" : { 37 | "adj" : "chr7_sparse_adj.npy", 38 | "imputed" : "chr7_exp1_nbr_5_impute.hdf5", 39 | "integer" : 7 40 | }, 41 | "8" : { 42 | "adj" : "chr8_sparse_adj.npy", 43 | "imputed" : "chr8_exp1_nbr_5_impute.hdf5", 44 | "integer" : 8 45 | }, 46 | "9" : { 47 | "adj" : "chr9_sparse_adj.npy", 48 | "imputed" : "chr9_exp1_nbr_5_impute.hdf5", 49 | "integer" : 9 50 | }, 51 | "10" : { 52 | "adj" : "chr10_sparse_adj.npy", 53 | "imputed" : "chr10_exp1_nbr_5_impute.hdf5", 54 | "integer" : 10 55 | }, 56 | "11" : { 57 | "adj" : "chr11_sparse_adj.npy", 58 | "imputed" : "chr11_exp1_nbr_5_impute.hdf5", 59 | "integer" : 11 60 | }, 61 | "12" : { 62 | "adj" : "chr12_sparse_adj.npy", 63 | "imputed" : "chr12_exp1_nbr_5_impute.hdf5", 64 | "integer" : 12 65 | }, 66 | "13" : { 67 | "adj" : "chr13_sparse_adj.npy", 68 | "imputed" : "chr13_exp1_nbr_5_impute.hdf5", 69 | "integer" : 13 70 | }, 71 | "14" : { 72 | "adj" : "chr14_sparse_adj.npy", 73 | "imputed" : "chr14_exp1_nbr_5_impute.hdf5", 74 | "integer" : 14 75 | }, 76 | "15" : { 77 | "adj" : "chr15_sparse_adj.npy", 78 | "imputed" : "chr15_exp1_nbr_5_impute.hdf5", 79 | "integer" : 15 80 | }, 81 | "16" : { 82 | "adj" : "chr16_sparse_adj.npy", 83 | "imputed" : "chr16_exp1_nbr_5_impute.hdf5", 84 | "integer" : 16 85 | }, 86 | "17" : { 87 | "adj" : "chr17_sparse_adj.npy", 88 | "imputed" : "chr17_exp1_nbr_5_impute.hdf5", 89 | "integer" : 17 90 | }, 91 | "18" : { 92 | "adj" : "chr18_sparse_adj.npy", 93 | "imputed" : "chr18_exp1_nbr_5_impute.hdf5", 94 | "integer" : 18 95 | }, 96 | "19" : { 97 | "adj" : "chr19_sparse_adj.npy", 98 | "imputed" : "chr19_exp1_nbr_5_impute.hdf5", 99 | "integer" : 19 100 | }, 101 | "20" : { 102 | "adj" : "chr20_sparse_adj.npy", 103 | "imputed" : "chr20_exp1_nbr_5_impute.hdf5", 104 | "integer" : 20 105 | }, 106 | "21" : { 107 | "adj" : "chr21_sparse_adj.npy", 108 | "imputed" : "chr21_exp1_nbr_5_impute.hdf5", 109 | "integer" : 21 110 | }, 111 | "22" : { 112 | "adj" : "chr22_sparse_adj.npy", 113 | "imputed" : "chr22_exp1_nbr_5_impute.hdf5", 114 | "integer" : 22 115 | } 116 | }, 117 | "chrom_sizes" : "data/hg38.chrom.sizes", 118 | "chrom_indices" : null, 119 | "embeddings_path" : "/mnt/e/data/wtc/embed/exp1_0_origin.npy", 120 | "higashi_scab_path" : "/mnt/e/data/wtc/scAB_with_nbr.hdf5", 121 | "cell_type" : null, 122 | "random_walk" : { 123 | "num_walks" : 25, 124 | "ignore_top" : 0.02, 125 | "top_percentile" : 0.25 126 | }, 127 | "eps": 1e-8, 128 | "num_clusters" : 5, 129 | "batch_size" : 16, 130 | "epochs" : 5, 131 | "resolution" : 500000, 132 | "neighbor_contacts" : true, 133 | "nearest_neighbor_override" : null, 134 | "gpu_uniques" : true, 135 | "cluster_gpu_caching" : true, 136 | "kmeans_init" : 1 137 | } -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "schic_directory" : "/directory/of/higashi/imputed/maps", 3 | "label_info": { 4 | "path":"/path/to/label_info.pickle", 5 | "cell_type_key": "cell type key in label_info.pickle" 6 | }, 7 | "data_directory" : "/directory/to/save/scghost/outputs/", 8 | "chromosomes" : { 9 | "1" : { 10 | "adj" : "chr1_sparse_adj.npy", 11 | "imputed" : "chr1_exp1_nbr_5_impute.hdf5", 12 | "integer" : 1 13 | }, 14 | "2" : { 15 | "adj" : "chr2_sparse_adj.npy", 16 | "imputed" : "chr2_exp1_nbr_5_impute.hdf5", 17 | "integer" : 2 18 | }, 19 | "3" : { 20 | "adj" : "chr3_sparse_adj.npy", 21 | "imputed" : "chr3_exp1_nbr_5_impute.hdf5", 22 | "integer" : 3 23 | }, 24 | "4" : { 25 | "adj" : "chr4_sparse_adj.npy", 26 | "imputed" : "chr4_exp1_nbr_5_impute.hdf5", 27 | "integer" : 4 28 | }, 29 | "5" : { 30 | "adj" : "chr5_sparse_adj.npy", 31 | "imputed" : "chr5_exp1_nbr_5_impute.hdf5", 32 | "integer" : 5 33 | }, 34 | "6" : { 35 | "adj" : "chr6_sparse_adj.npy", 36 | "imputed" : "chr6_exp1_nbr_5_impute.hdf5", 37 | "integer" : 6 38 | }, 39 | "7" : { 40 | "adj" : "chr7_sparse_adj.npy", 41 | "imputed" : "chr7_exp1_nbr_5_impute.hdf5", 42 | "integer" : 7 43 | }, 44 | "8" : { 45 | "adj" : "chr8_sparse_adj.npy", 46 | "imputed" : "chr8_exp1_nbr_5_impute.hdf5", 47 | "integer" : 8 48 | }, 49 | "9" : { 50 | "adj" : "chr9_sparse_adj.npy", 51 | "imputed" : "chr9_exp1_nbr_5_impute.hdf5", 52 | "integer" : 9 53 | }, 54 | "10" : { 55 | "adj" : "chr10_sparse_adj.npy", 56 | "imputed" : "chr10_exp1_nbr_5_impute.hdf5", 57 | "integer" : 10 58 | }, 59 | "11" : { 60 | "adj" : "chr11_sparse_adj.npy", 61 | "imputed" : "chr11_exp1_nbr_5_impute.hdf5", 62 | "integer" : 11 63 | }, 64 | "12" : { 65 | "adj" : "chr12_sparse_adj.npy", 66 | "imputed" : "chr12_exp1_nbr_5_impute.hdf5", 67 | "integer" : 12 68 | }, 69 | "13" : { 70 | "adj" : "chr13_sparse_adj.npy", 71 | "imputed" : "chr13_exp1_nbr_5_impute.hdf5", 72 | "integer" : 13 73 | }, 74 | "14" : { 75 | "adj" : "chr14_sparse_adj.npy", 76 | "imputed" : "chr14_exp1_nbr_5_impute.hdf5", 77 | "integer" : 14 78 | }, 79 | "15" : { 80 | "adj" : "chr15_sparse_adj.npy", 81 | "imputed" : "chr15_exp1_nbr_5_impute.hdf5", 82 | "integer" : 15 83 | }, 84 | "16" : { 85 | "adj" : "chr16_sparse_adj.npy", 86 | "imputed" : "chr16_exp1_nbr_5_impute.hdf5", 87 | "integer" : 16 88 | }, 89 | "17" : { 90 | "adj" : "chr17_sparse_adj.npy", 91 | "imputed" : "chr17_exp1_nbr_5_impute.hdf5", 92 | "integer" : 17 93 | }, 94 | "18" : { 95 | "adj" : "chr18_sparse_adj.npy", 96 | "imputed" : "chr18_exp1_nbr_5_impute.hdf5", 97 | "integer" : 18 98 | }, 99 | "19" : { 100 | "adj" : "chr19_sparse_adj.npy", 101 | "imputed" : "chr19_exp1_nbr_5_impute.hdf5", 102 | "integer" : 19 103 | }, 104 | "20" : { 105 | "adj" : "chr20_sparse_adj.npy", 106 | "imputed" : "chr20_exp1_nbr_5_impute.hdf5", 107 | "integer" : 20 108 | }, 109 | "21" : { 110 | "adj" : "chr21_sparse_adj.npy", 111 | "imputed" : "chr21_exp1_nbr_5_impute.hdf5", 112 | "integer" : 21 113 | }, 114 | "22" : { 115 | "adj" : "chr22_sparse_adj.npy", 116 | "imputed" : "chr22_exp1_nbr_5_impute.hdf5", 117 | "integer" : 22 118 | } 119 | }, 120 | "chrom_sizes" : "data/hg38.chrom.sizes", 121 | "chrom_indices" : null, 122 | "embeddings_path" : "/path/to/exp1_0_origin.npy", 123 | "higashi_scab_path" : "/path/to/higashi/scAB.hdf5", 124 | "cell_type" : null, 125 | "random_walk" : { 126 | "num_walks" : 50, 127 | "ignore_top" : 0.02, 128 | "top_percentile" : 0.25 129 | }, 130 | "eps": 1e-8, 131 | "num_clusters" : 5, 132 | "batch_size" : 16, 133 | "epochs" : 5, 134 | "resolution" : 500000, 135 | "neighbor_contacts" : false, 136 | "nearest_neighbor_override" : null, 137 | "gpu_uniques" : true, 138 | "cluster_gpu_caching" : true, 139 | "kmeans_init" : 1 140 | } -------------------------------------------------------------------------------- /modules/preprocessing.py: -------------------------------------------------------------------------------- 1 | from cProfile import run 2 | import numpy as np 3 | import pickle 4 | import gc 5 | import os 6 | import h5py 7 | 8 | from utilities.parsers import parse_chromosomes, parse_cell_types 9 | from utilities.chrom_sizes import chrom_sizes 10 | from scipy.sparse import coo_matrix 11 | from tqdm import trange 12 | 13 | def compute_chrom_indices(runtime_args,save=True): 14 | 15 | if runtime_args['chrom_indices'] is not None: 16 | return 17 | 18 | chromosomes = parse_chromosomes(runtime_args) 19 | cell_type_index = parse_cell_types(runtime_args) 20 | 21 | chrom_indices = {} 22 | 23 | for n in trange(len(chromosomes)): 24 | 25 | chrom = chromosomes[n] 26 | adj_path = runtime_args['chromosomes'][chrom]['adj'] 27 | 28 | sparse_M = np.load(os.path.join(runtime_args['schic_directory'],adj_path),allow_pickle=True) 29 | sparse_M = sparse_M[cell_type_index] if cell_type_index is not None else sparse_M 30 | 31 | M = sparse_M.sum(axis=0).toarray() 32 | 33 | nongap = np.where(np.sum(M > 0, axis=-1, keepdims=False) >= (0.1 * M.shape[0]))[0] 34 | 35 | chrom_indices[chrom] = nongap 36 | 37 | gc.collect() 38 | 39 | data_dir = runtime_args['data_directory'] 40 | 41 | if save: 42 | pickle.dump(chrom_indices,open( 43 | os.path.join(data_dir,'chrom_indices.pkl'),'wb' 44 | )) 45 | 46 | return chrom_indices 47 | 48 | def extract_OEMs(fname,cell_type_index,chrom_indices,num_cells,chrom_num,chrom_start_end,save_path=None,eps=1e-8): 49 | f = h5py.File(fname) 50 | 51 | chrom_size = chrom_start_end[chrom_num-1,1] - chrom_start_end[chrom_num-1,0] 52 | coords = np.array(f['coordinates']) 53 | 54 | if cell_type_index is None: 55 | cti = [] 56 | for i in range(len(f)): 57 | if 'cell_%d' % i in f: 58 | cti.append(i) 59 | 60 | cell_type_index = np.array(cti) 61 | 62 | num_cells = len(cell_type_index) if num_cells is None else np.min([num_cells,len(cell_type_index)]) 63 | cells_data = np.array([np.array(f['cell_%d' % cell_type_index[i]]) for i in range(num_cells)]) 64 | 65 | OEMs = [] 66 | Ms = [] 67 | 68 | for cell_num in trange(num_cells): 69 | M = coo_matrix((cells_data[cell_num],(coords[:,0],coords[:,1])),shape=(chrom_size,chrom_size)).toarray() 70 | M += M.T 71 | 72 | # construct expected matrix 73 | E = np.zeros_like(M) 74 | l = len(M) 75 | 76 | for i in range(M.shape[0]): 77 | contacts = np.diag(M,i) 78 | expected = contacts.sum() / (l-i) 79 | # expected = np.median(contacts) 80 | x_diag,y_diag = np.diag_indices(M.shape[0]-i) 81 | x,y = x_diag,y_diag+i 82 | E[x,y] = expected 83 | 84 | E += E.T 85 | E = np.nan_to_num(E) + eps 86 | 87 | OE = M / E 88 | OE = OE[chrom_indices][:,chrom_indices] 89 | OE[OE == 0] = 1 90 | OE = np.log(OE) 91 | Ms.append(M[chrom_indices][:,chrom_indices]) 92 | OEMs.append(OE) 93 | 94 | OEMs = np.array(OEMs) 95 | Ms = np.array(Ms) 96 | 97 | # print(OEMs.shape) 98 | if save_path is None: 99 | return OEMs#, Ms 100 | else: 101 | # np.savez_compressed(save_path,oe=OEMs,observed=Ms) 102 | np.save(save_path,OEMs) 103 | # np.save(save_path+'_observed',Ms) 104 | 105 | def compute_observed_over_expected(runtime_args): 106 | 107 | chrom_start_end = np.load(os.path.join(runtime_args['schic_directory'],'chrom_start_end.npy')) 108 | cell_type = runtime_args['cell_type'] 109 | chrom_indices = pickle.load(open(os.path.join(runtime_args['data_directory'],'chrom_indices.pkl'),'rb')) 110 | chromosomes = parse_chromosomes(runtime_args) 111 | 112 | for n in range(len(chromosomes)): 113 | 114 | chrom_num = chromosomes[n] 115 | 116 | if runtime_args['chromosomes'][chrom_num]['matrix'] is not None: 117 | continue 118 | 119 | impute_path = runtime_args['chromosomes'][chrom_num]['imputed'] 120 | 121 | cell_type_index = parse_cell_types(runtime_args) 122 | 123 | extract_OEMs( 124 | os.path.join(runtime_args['schic_directory'],impute_path), 125 | cell_type_index, 126 | chrom_indices[chrom_num], 127 | None, 128 | runtime_args['chromosomes'][chrom_num]['integer'], 129 | chrom_start_end, 130 | save_path=os.path.join(runtime_args['data_directory'],'{0}_oe'.format(chrom_num)), 131 | eps=runtime_args['eps'] 132 | ) 133 | 134 | print('{0} complete'.format(chrom_num)) 135 | gc.collect() -------------------------------------------------------------------------------- /sample_configs/config_GM12878.json: -------------------------------------------------------------------------------- 1 | { 2 | "schic_directory" : "/mnt/e/data/4dn_orig", 3 | "label_info": { 4 | "path":"/mnt/e/data/4dn_scihic/label_info_4DN_sciHiC.pickle", 5 | "cell_type_key": "cell type" 6 | }, 7 | "data_directory" : "/mnt/e/data/scghost_GM12878_output", 8 | "chromosomes" : { 9 | "1" : { 10 | "adj" : "chr1_sparse_adj.npy", 11 | "imputed" : "chr1_exp1_nbr_5_impute.hdf5", 12 | "integer" : 1 13 | }, 14 | "2" : { 15 | "adj" : "chr2_sparse_adj.npy", 16 | "imputed" : "chr2_exp1_nbr_5_impute.hdf5", 17 | "integer" : 2 18 | }, 19 | "3" : { 20 | "adj" : "chr3_sparse_adj.npy", 21 | "imputed" : "chr3_exp1_nbr_5_impute.hdf5", 22 | "integer" : 3 23 | }, 24 | "4" : { 25 | "adj" : "chr4_sparse_adj.npy", 26 | "imputed" : "chr4_exp1_nbr_5_impute.hdf5", 27 | "integer" : 4 28 | }, 29 | "5" : { 30 | "adj" : "chr5_sparse_adj.npy", 31 | "imputed" : "chr5_exp1_nbr_5_impute.hdf5", 32 | "integer" : 5 33 | }, 34 | "6" : { 35 | "adj" : "chr6_sparse_adj.npy", 36 | "imputed" : "chr6_exp1_nbr_5_impute.hdf5", 37 | "integer" : 6 38 | }, 39 | "7" : { 40 | "adj" : "chr7_sparse_adj.npy", 41 | "imputed" : "chr7_exp1_nbr_5_impute.hdf5", 42 | "integer" : 7 43 | }, 44 | "8" : { 45 | "adj" : "chr8_sparse_adj.npy", 46 | "imputed" : "chr8_exp1_nbr_5_impute.hdf5", 47 | "integer" : 8 48 | }, 49 | "9" : { 50 | "adj" : "chr9_sparse_adj.npy", 51 | "imputed" : "chr9_exp1_nbr_5_impute.hdf5", 52 | "integer" : 9 53 | }, 54 | "10" : { 55 | "adj" : "chr10_sparse_adj.npy", 56 | "imputed" : "chr10_exp1_nbr_5_impute.hdf5", 57 | "integer" : 10 58 | }, 59 | "11" : { 60 | "adj" : "chr11_sparse_adj.npy", 61 | "imputed" : "chr11_exp1_nbr_5_impute.hdf5", 62 | "integer" : 11 63 | }, 64 | "12" : { 65 | "adj" : "chr12_sparse_adj.npy", 66 | "imputed" : "chr12_exp1_nbr_5_impute.hdf5", 67 | "integer" : 12 68 | }, 69 | "13" : { 70 | "adj" : "chr13_sparse_adj.npy", 71 | "imputed" : "chr13_exp1_nbr_5_impute.hdf5", 72 | "integer" : 13 73 | }, 74 | "14" : { 75 | "adj" : "chr14_sparse_adj.npy", 76 | "imputed" : "chr14_exp1_nbr_5_impute.hdf5", 77 | "integer" : 14 78 | }, 79 | "15" : { 80 | "adj" : "chr15_sparse_adj.npy", 81 | "imputed" : "chr15_exp1_nbr_5_impute.hdf5", 82 | "integer" : 15 83 | }, 84 | "16" : { 85 | "adj" : "chr16_sparse_adj.npy", 86 | "imputed" : "chr16_exp1_nbr_5_impute.hdf5", 87 | "integer" : 16 88 | }, 89 | "17" : { 90 | "adj" : "chr17_sparse_adj.npy", 91 | "imputed" : "chr17_exp1_nbr_5_impute.hdf5", 92 | "integer" : 17 93 | }, 94 | "18" : { 95 | "adj" : "chr18_sparse_adj.npy", 96 | "imputed" : "chr18_exp1_nbr_5_impute.hdf5", 97 | "integer" : 18 98 | }, 99 | "19" : { 100 | "adj" : "chr19_sparse_adj.npy", 101 | "imputed" : "chr19_exp1_nbr_5_impute.hdf5", 102 | "integer" : 19 103 | }, 104 | "20" : { 105 | "adj" : "chr20_sparse_adj.npy", 106 | "imputed" : "chr20_exp1_nbr_5_impute.hdf5", 107 | "integer" : 20 108 | }, 109 | "21" : { 110 | "adj" : "chr21_sparse_adj.npy", 111 | "imputed" : "chr21_exp1_nbr_5_impute.hdf5", 112 | "integer" : 21 113 | }, 114 | "22" : { 115 | "adj" : "chr22_sparse_adj.npy", 116 | "imputed" : "chr22_exp1_nbr_5_impute.hdf5", 117 | "integer" : 22 118 | } 119 | }, 120 | "chrom_sizes" : "data/hg38.chrom.sizes", 121 | "chrom_indices" : "/mnt/e/data/scghost_GM12878_output/chrom_indices.pkl", 122 | "embeddings_path" : "/mnt/e/data/4dn_scihic/exp1_0_origin.npy", 123 | "higashi_scab_path" : "/mnt/e/data/4dn_scihic/scAB.hdf5", 124 | "cell_type" : "GM12878", 125 | "random_walk" : { 126 | "num_walks" : 25, 127 | "ignore_top" : 0.02, 128 | "top_percentile" : 0.25 129 | }, 130 | "eps": 1e-8, 131 | "num_clusters" : 5, 132 | "batch_size" : 16, 133 | "epochs" : 5, 134 | "resolution" : 500000, 135 | "neighbor_contacts" : false, 136 | "nearest_neighbor_override" : null, 137 | "gpu_uniques" : true, 138 | "cluster_gpu_caching" : true, 139 | "kmeans_init" : 1 140 | } -------------------------------------------------------------------------------- /modules/embedding.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | import pickle 5 | import gc 6 | 7 | from tqdm.auto import trange 8 | from modules.preprocessing import parse_chromosomes 9 | from torch import nn 10 | from tqdm import trange 11 | from torch.nn import functional as F 12 | 13 | num_cells = 500 14 | 15 | def to_cuda(x): 16 | if torch.cuda.is_available(): 17 | return x.cuda() 18 | 19 | return x 20 | 21 | 22 | class hubs(nn.Module): 23 | def __init__(self, N, num_cells, hidden_dim=128): 24 | super(hubs, self).__init__() 25 | self.N = N 26 | self.num_cells = num_cells 27 | self.hidden_dim = hidden_dim 28 | 29 | self.embedding = nn.Embedding(self.N * self.num_cells, 30 | self.hidden_dim, sparse=True, max_norm=1) 31 | 32 | to_cuda(self) 33 | 34 | def to_one_hot(self, x): 35 | return F.one_hot(x, num_classes=self.N * self.num_cells).float() 36 | 37 | def embed(self, x): 38 | return self.embedding(x) 39 | 40 | 41 | def prep_pairs_labels(all_pairs, all_labels, gap, indices=None, thresh=None): 42 | concatenated_pairs = [] 43 | concatenated_labels = [] 44 | 45 | lengths = [len(xx) for xx in all_pairs] 46 | clip_len = np.min(lengths) if thresh is None else thresh 47 | iterable = range(len(all_pairs)) if indices is None else indices 48 | 49 | kept_cells = [] 50 | n = 0 51 | 52 | # Random permutation within each cell 53 | for i in iterable: 54 | cell_pairs = all_pairs[i] 55 | 56 | if thresh is not None and len(cell_pairs) < thresh: 57 | continue 58 | 59 | id_ = torch.randperm(len(cell_pairs))[:clip_len] 60 | concatenated_pairs.append(cell_pairs[id_] + n * gap) 61 | concatenated_labels.append(all_labels[i][id_]) 62 | 63 | n += 1 64 | kept_cells.append(i) 65 | 66 | # Stack instead of concat, shape of (#cell, #pairs, 2) 67 | concatenated_pairs = torch.stack(concatenated_pairs, dim=0) 68 | concatenated_labels = torch.stack(concatenated_labels, dim=0) 69 | 70 | return (concatenated_pairs, concatenated_labels) if thresh is None else (concatenated_pairs,concatenated_labels,np.array(kept_cells)) 71 | 72 | 73 | def embed_single_cells_unified(all_continuous_pairs, all_continuous_labels, OEMs, embedding_file, epochs=1, 74 | cell_nums=None, batch_size=64, verbose=False, prepped=False): 75 | cell_nums = np.arange(len(all_continuous_pairs)) if cell_nums is None else cell_nums 76 | 77 | model = hubs(len(OEMs[0]), len(cell_nums), hidden_dim=128) 78 | bs = batch_size 79 | 80 | all_Es = [] 81 | optimizer = torch.optim.SparseAdam(model.parameters()) 82 | 83 | if not prepped: 84 | all_continuous_pairs, all_continuous_labels = prep_pairs_labels(all_continuous_pairs, 85 | all_continuous_labels, 86 | OEMs[0].shape[0], 87 | indices=cell_nums) 88 | for epoch in range(epochs): 89 | 90 | shuffle_id = torch.randperm(all_continuous_pairs.shape[1]) 91 | 92 | N_pairs = all_continuous_pairs.shape[-2] 93 | rloss = 0 94 | rsamples = 0 95 | bar = trange(0, N_pairs, bs) if verbose else range(0, N_pairs, bs) 96 | 97 | for i in bar: 98 | # During training, sample a batch of pairs from each cell (can be small 16 yields good results) 99 | # You can also sample a batch of cells as well, but that needs to be sth large, like 2k cells etc. 100 | x = model.embed(to_cuda(all_continuous_pairs[:, shuffle_id[i:i + bs], :])) 101 | x1 = x[:, :, 0] 102 | x2 = x[:, :, 1] 103 | 104 | y = to_cuda(all_continuous_labels[:, shuffle_id[i:i + bs]]) 105 | sim = F.cosine_similarity(x1, x2, dim=-1) 106 | optimizer.zero_grad() 107 | loss = F.mse_loss(sim, y) 108 | loss.backward() 109 | optimizer.step() 110 | 111 | blen = x.shape[1] 112 | rloss += float(loss) * blen 113 | rsamples += blen 114 | 115 | print('Epoch %d: %d/%d -- %.6f loss' % (epoch, rsamples, N_pairs, rloss / rsamples), end='\r') 116 | 117 | print() 118 | 119 | num_loci = model.N * model.num_cells 120 | bar = trange(0, num_loci, bs) if verbose else range(0, num_loci, bs) 121 | 122 | for i in bar: 123 | end = np.min([i + bs, num_loci]) 124 | 125 | x = model.embed(to_cuda(torch.arange(i, end))).to_dense().detach().cpu().numpy() 126 | all_Es.append(x) 127 | 128 | all_Es = np.vstack(all_Es) # final shape - (num_cells * num_chrom_loci, hidden_dim) 129 | 130 | all_Es = all_Es.reshape((model.num_cells, model.N, model.hidden_dim)) 131 | 132 | np.save(embedding_file, all_Es) -------------------------------------------------------------------------------- /sample_configs/config_pfc.json: -------------------------------------------------------------------------------- 1 | { 2 | "schic_directory" : "/mnt/e/data/pfc", 3 | "label_info": { 4 | "path":"/mnt/e/data/pfc/label_info.pickle", 5 | "cell_type_key": "cluster label" 6 | }, 7 | "data_directory" : "/mnt/e/data/scghost_pfc_output", 8 | "chromosomes" : { 9 | "1" : { 10 | "adj" : "chr1_sparse_adj.npy", 11 | "imputed" : "chr1_exp5_zinb_nbr_5_impute.hdf5", 12 | "integer" : 1 13 | }, 14 | "2" : { 15 | "adj" : "chr2_sparse_adj.npy", 16 | "imputed" : "chr2_exp5_zinb_nbr_5_impute.hdf5", 17 | "integer" : 2 18 | }, 19 | "3" : { 20 | "adj" : "chr3_sparse_adj.npy", 21 | "imputed" : "chr3_exp5_zinb_nbr_5_impute.hdf5", 22 | "integer" : 3 23 | }, 24 | "4" : { 25 | "adj" : "chr4_sparse_adj.npy", 26 | "imputed" : "chr4_exp5_zinb_nbr_5_impute.hdf5", 27 | "integer" : 4 28 | }, 29 | "5" : { 30 | "adj" : "chr5_sparse_adj.npy", 31 | "imputed" : "chr5_exp5_zinb_nbr_5_impute.hdf5", 32 | "integer" : 5 33 | }, 34 | "6" : { 35 | "adj" : "chr6_sparse_adj.npy", 36 | "imputed" : "chr6_exp5_zinb_nbr_5_impute.hdf5", 37 | "integer" : 6 38 | }, 39 | "7" : { 40 | "adj" : "chr7_sparse_adj.npy", 41 | "imputed" : "chr7_exp5_zinb_nbr_5_impute.hdf5", 42 | "integer" : 7 43 | }, 44 | "8" : { 45 | "adj" : "chr8_sparse_adj.npy", 46 | "imputed" : "chr8_exp5_zinb_nbr_5_impute.hdf5", 47 | "integer" : 8 48 | }, 49 | "9" : { 50 | "adj" : "chr9_sparse_adj.npy", 51 | "imputed" : "chr9_exp5_zinb_nbr_5_impute.hdf5", 52 | "integer" : 9 53 | }, 54 | "10" : { 55 | "adj" : "chr10_sparse_adj.npy", 56 | "imputed" : "chr10_exp5_zinb_nbr_5_impute.hdf5", 57 | "integer" : 10 58 | }, 59 | "11" : { 60 | "adj" : "chr11_sparse_adj.npy", 61 | "imputed" : "chr11_exp5_zinb_nbr_5_impute.hdf5", 62 | "integer" : 11 63 | }, 64 | "12" : { 65 | "adj" : "chr12_sparse_adj.npy", 66 | "imputed" : "chr12_exp5_zinb_nbr_5_impute.hdf5", 67 | "integer" : 12 68 | }, 69 | "13" : { 70 | "adj" : "chr13_sparse_adj.npy", 71 | "imputed" : "chr13_exp5_zinb_nbr_5_impute.hdf5", 72 | "integer" : 13 73 | }, 74 | "14" : { 75 | "adj" : "chr14_sparse_adj.npy", 76 | "imputed" : "chr14_exp5_zinb_nbr_5_impute.hdf5", 77 | "integer" : 14 78 | }, 79 | "15" : { 80 | "adj" : "chr15_sparse_adj.npy", 81 | "imputed" : "chr15_exp5_zinb_nbr_5_impute.hdf5", 82 | "integer" : 15 83 | }, 84 | "16" : { 85 | "adj" : "chr16_sparse_adj.npy", 86 | "imputed" : "chr16_exp5_zinb_nbr_5_impute.hdf5", 87 | "integer" : 16 88 | }, 89 | "17" : { 90 | "adj" : "chr17_sparse_adj.npy", 91 | "imputed" : "chr17_exp5_zinb_nbr_5_impute.hdf5", 92 | "integer" : 17 93 | }, 94 | "18" : { 95 | "adj" : "chr18_sparse_adj.npy", 96 | "imputed" : "chr18_exp5_zinb_nbr_5_impute.hdf5", 97 | "integer" : 18 98 | }, 99 | "19" : { 100 | "adj" : "chr19_sparse_adj.npy", 101 | "imputed" : "chr19_exp5_zinb_nbr_5_impute.hdf5", 102 | "integer" : 19 103 | }, 104 | "20" : { 105 | "adj" : "chr20_sparse_adj.npy", 106 | "imputed" : "chr20_exp5_zinb_nbr_5_impute.hdf5", 107 | "integer" : 20 108 | }, 109 | "21" : { 110 | "adj" : "chr21_sparse_adj.npy", 111 | "imputed" : "chr21_exp5_zinb_nbr_5_impute.hdf5", 112 | "integer" : 21 113 | }, 114 | "22" : { 115 | "adj" : "chr22_sparse_adj.npy", 116 | "imputed" : "chr22_exp5_zinb_nbr_5_impute.hdf5", 117 | "integer" : 22 118 | } 119 | }, 120 | "chrom_sizes" : "data/hg19.chrom.sizes", 121 | "chrom_indices" : "/mnt/e/data/scghost_pfc_output/chrom_indices.pkl", 122 | "embeddings_path" : "/mnt/e/data/pfc/embed/exp5_zinb_0_origin.npy", 123 | "higashi_scab_path" : "/mnt/e/data/pfc/scAB.hdf5", 124 | "cell_type" : null, 125 | "random_walk" : { 126 | "num_walks" : 25, 127 | "ignore_top" : 0.02, 128 | "top_percentile" : 0.25 129 | }, 130 | "eps": 1e-8, 131 | "num_clusters" : 5, 132 | "batch_size" : 16, 133 | "epochs" : 5, 134 | "resolution" : 500000, 135 | "neighbor_contacts" : true, 136 | "nearest_neighbor_override" : null, 137 | "gpu_uniques" : true, 138 | "cluster_gpu_caching" : true, 139 | "kmeans_init" : 1 140 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.6 2 | aiosignal==1.3.1 3 | anyio==4.0.0 4 | argon2-cffi==23.1.0 5 | argon2-cffi-bindings==21.2.0 6 | arrow==1.3.0 7 | async-timeout==4.0.3 8 | attrs==23.1.0 9 | beautifulsoup4==4.12.2 10 | bleach==6.1.0 11 | bokeh==3.3.0 12 | Brotli @ file:///tmp/abs_ecyw11_7ze/croots/recipe/brotli-split_1659616059936/work 13 | cachetools==5.3.2 14 | certifi==2023.7.22 15 | cffi @ file:///croot/cffi_1670423208954/work 16 | charset-normalizer @ file:///tmp/build/80754af9/charset-normalizer_1630003229654/work 17 | click==8.1.7 18 | click-plugins==1.1.1 19 | cligj==0.7.2 20 | cloudpickle==3.0.0 21 | colorcet==3.0.1 22 | contourpy==1.1.1 23 | cryptography @ file:///croot/cryptography_1694444244250/work 24 | cubinlinker-cu11==0.3.0.post1 25 | cucim==23.10.0 26 | cuda-python==11.8.2 27 | cudf-cu11==23.10.0 28 | cugraph-cu11==23.10.0 29 | cuml-cu11==23.10.0 30 | cuproj-cu11==23.10.0 31 | cupy-cuda11x==12.2.0 32 | cuspatial-cu11==23.10.0 33 | cuxfilter-cu11==23.10.0 34 | Cython==3.0.4 35 | dask==2023.9.2 36 | dask-cuda==23.10.0 37 | dask-cudf-cu11==23.10.0 38 | datashader==0.16.0 39 | defusedxml==0.7.1 40 | distributed==2023.9.2 41 | exceptiongroup==1.1.3 42 | fastjsonschema==2.18.1 43 | fastrlock==0.8.2 44 | filelock @ file:///croot/filelock_1672387128942/work 45 | fiona==1.9.5 46 | fqdn==1.5.1 47 | frozenlist==1.4.0 48 | fsspec==2023.10.0 49 | geopandas==0.14.0 50 | gmpy2 @ file:///tmp/build/80754af9/gmpy2_1645455533097/work 51 | h5py @ file:///croot/h5py_1691589708553/work 52 | hmmlearn @ file:///home/conda/feedstock_root/build_artifacts/hmmlearn_1696709150716/work 53 | holoviews==1.18.0 54 | idna @ file:///croot/idna_1666125576474/work 55 | importlib-metadata==6.8.0 56 | isoduration==20.11.0 57 | Jinja2 @ file:///croot/jinja2_1666908132255/work 58 | joblib @ file:///croot/joblib_1685113087166/work 59 | jsonpointer==2.4 60 | jsonschema==4.19.1 61 | jsonschema-specifications==2023.7.1 62 | jupyter-events==0.8.0 63 | jupyter_client==8.5.0 64 | jupyter_core==5.4.0 65 | jupyter_server==2.9.1 66 | jupyter_server_proxy==4.1.0 67 | jupyter_server_terminals==0.4.4 68 | jupyterlab-pygments==0.2.2 69 | lazy_loader==0.3 70 | linkify-it-py==2.0.2 71 | llvmlite==0.40.1 72 | locket==1.0.0 73 | Markdown==3.5 74 | markdown-it-py==3.0.0 75 | MarkupSafe @ file:///opt/conda/conda-bld/markupsafe_1654597864307/work 76 | mdit-py-plugins==0.4.0 77 | mdurl==0.1.2 78 | mistune==3.0.2 79 | mkl-fft @ file:///croot/mkl_fft_1695058164594/work 80 | mkl-random @ file:///croot/mkl_random_1695059800811/work 81 | mkl-service==2.4.0 82 | mpmath @ file:///croot/mpmath_1690848262763/work 83 | msgpack==1.0.7 84 | multidict==6.0.4 85 | multipledispatch==1.0.0 86 | nbclient==0.8.0 87 | nbconvert==7.9.2 88 | nbformat==5.9.2 89 | networkx @ file:///croot/networkx_1690561992265/work 90 | numba==0.57.1 91 | numpy==1.24.3 92 | nvtx==0.2.8 93 | opencv-python==4.7.0.72 94 | overrides==7.4.0 95 | packaging==23.2 96 | pandas==1.5.3 97 | pandocfilters==1.5.0 98 | panel==1.3.0 99 | param==2.0.0 100 | partd==1.4.1 101 | Pillow==9.5.0 102 | platformdirs==3.11.0 103 | prometheus-client==0.17.1 104 | protobuf==4.24.4 105 | psutil==5.9.6 106 | ptxcompiler-cu11==0.7.0.post1 107 | ptyprocess==0.7.0 108 | pyarrow==12.0.1 109 | pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work 110 | pyct==0.5.0 111 | Pygments==2.16.1 112 | pylibcugraph-cu11==23.10.0 113 | pylibraft-cu11==23.10.0 114 | pynvml==11.4.1 115 | pyOpenSSL @ file:///croot/pyopenssl_1690223430423/work 116 | pyproj==3.6.1 117 | PySocks @ file:///home/builder/ci_310/pysocks_1640793678128/work 118 | python-dateutil==2.8.2 119 | python-json-logger==2.0.7 120 | pytz==2023.3 121 | pyviz_comms==3.0.0 122 | PyYAML @ file:///croot/pyyaml_1698096049011/work 123 | pyzmq==25.1.1 124 | raft-dask-cu11==23.10.0 125 | referencing==0.30.2 126 | requests @ file:///croot/requests_1690400202158/work 127 | rfc3339-validator==0.1.4 128 | rfc3986-validator==0.1.1 129 | rmm-cu11==23.10.0 130 | rpds-py==0.10.6 131 | scikit-learn @ file:///croot/scikit-learn_1690978916802/work 132 | scipy @ file:///croot/scipy_1696543286448/work/dist/scipy-1.11.3-cp310-cp310-linux_x86_64.whl#sha256=16a8c87e543daeb96d1310b2283f542fef2de84ee7090f60187eb71f71cb430c 133 | Send2Trash==1.8.2 134 | shapely==2.0.2 135 | simpervisor==1.0.0 136 | six==1.16.0 137 | sniffio==1.3.0 138 | sortedcontainers==2.4.0 139 | soupsieve==2.5 140 | sympy @ file:///croot/sympy_1668202399572/work 141 | tblib==3.0.0 142 | terminado==0.17.1 143 | threadpoolctl==3.2.0 144 | tinycss2==1.2.1 145 | toolz==0.12.0 146 | torch==2.1.0 147 | torchaudio==2.1.0 148 | torchvision==0.16.0 149 | tornado==6.3.3 150 | tqdm==4.65.0 151 | traitlets==5.12.0 152 | treelite==3.9.1 153 | treelite-runtime==3.9.1 154 | triton==2.1.0 155 | types-python-dateutil==2.8.19.14 156 | typing_extensions @ file:///croot/typing_extensions_1690297465030/work 157 | tzdata==2023.3 158 | uc-micro-py==1.0.2 159 | ucx-py-cu11==0.34.0 160 | uri-template==1.3.0 161 | urllib3 @ file:///croot/urllib3_1698257533958/work 162 | webcolors==1.13 163 | webencodings==0.5.1 164 | websocket-client==1.6.4 165 | xarray==2023.10.1 166 | xyzservices==2023.10.1 167 | yarl==1.9.2 168 | zict==3.0.0 169 | zipp==3.17.0 170 | -------------------------------------------------------------------------------- /scghost.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pickle 4 | import torch 5 | import numpy as np 6 | import gc 7 | 8 | from utilities.parsers import parse_config, parse_chromosomes, parse_cell_types, parse_nearest_neighbors 9 | 10 | from modules.preprocessing import compute_chrom_indices, extract_OEMs 11 | from modules.postprocessing import post_process_samples 12 | from modules.random_walk import sample_chrom 13 | from modules.embedding import embed_single_cells_unified, prep_pairs_labels 14 | from modules.clustering import scghost_clustering 15 | from tqdm import trange 16 | 17 | if __name__ == '__main__': 18 | 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--config', metavar='cfg', type=str,default='./config.json',help='Path to the configuration file') 21 | 22 | args = parser.parse_args() 23 | 24 | runtime_args = parse_config(args.config) 25 | 26 | os.makedirs(runtime_args['data_directory'],exist_ok=True) 27 | 28 | num_walks = runtime_args['random_walk']['num_walks'] 29 | neighbor_contacts = True if 'neighbor_contacts' not in runtime_args else runtime_args['neighbor_contacts'] 30 | gpu_uniques = False if 'gpu_uniques' not in runtime_args else runtime_args['gpu_uniques'] 31 | 32 | # define globals 33 | print('Parsing chromosomes') 34 | chromosomes = parse_chromosomes(runtime_args) 35 | 36 | print('Parsing chromosome indices') 37 | chrom_indices = compute_chrom_indices(runtime_args) if runtime_args[ 38 | 'chrom_indices' 39 | ] is None else pickle.load(open(runtime_args['chrom_indices'],'rb')) 40 | 41 | print('Parsing cell types') 42 | cell_type = runtime_args['cell_type'] 43 | cell_type_index = parse_cell_types(runtime_args) 44 | 45 | print('Parsing remaining global variables') 46 | chrom_start_end = np.load(os.path.join(runtime_args['schic_directory'],'chrom_start_end.npy')) 47 | 48 | nearest_neighbors = None 49 | 50 | if 'nearest_neighbor_override' in runtime_args and runtime_args['nearest_neighbor_override'] is not None: 51 | print('Using nearest neighbor override') 52 | nearest_neighbors = np.load(runtime_args['nearest_neighbor_override']) if runtime_args['nearest_neighbor_override'] is not None else parse_nearest_neighbors(runtime_args) 53 | else: 54 | nearest_neighbors = parse_nearest_neighbors(runtime_args) 55 | 56 | batch_size = runtime_args['batch_size'] 57 | n_epochs = runtime_args['epochs'] 58 | 59 | # per chromosome loop 60 | for chrom in chromosomes: 61 | 62 | # if embedding already generated, skip 63 | if os.path.exists( 64 | os.path.join(runtime_args['data_directory'],'{0}_embeddings.npy'.format(chrom)) 65 | ): 66 | continue 67 | 68 | print('Processing chromosome {0}'.format(chrom)) 69 | impute_path = runtime_args['chromosomes'][chrom]['imputed'] 70 | 71 | # compute O/E matrices 72 | 73 | oem_override = None if 'oe_matrices' not in runtime_args['chromosomes'][chrom] else runtime_args['chromosomes'][chrom]['oe_matrices'] 74 | 75 | OEMs = extract_OEMs( 76 | os.path.join(runtime_args['schic_directory'],impute_path), 77 | cell_type_index, 78 | chrom_indices[chrom], 79 | None, 80 | runtime_args['chromosomes'][chrom]['integer'], 81 | chrom_start_end, 82 | save_path=None, 83 | eps=runtime_args['eps'] 84 | ) if oem_override is None else np.load(oem_override)['contact_maps'] 85 | gc.collect() 86 | 87 | # random walk 88 | OEMs = torch.tensor(OEMs) 89 | corr_OEMs = torch.zeros_like(OEMs) 90 | 91 | for i in trange(len(OEMs)): 92 | corr_OEMs[i] = torch.nan_to_num(torch.corrcoef(OEMs[i])) 93 | corr_OEMs[i].fill_diagonal_(0) 94 | 95 | corr_OEMs = corr_OEMs.type(torch.bfloat16) 96 | 97 | gc.collect() 98 | 99 | all_cell_chrom_samples = sample_chrom(chrom,corr_OEMs,np.arange(len(corr_OEMs)),nearest_neighbors,num_walks=num_walks) 100 | 101 | del corr_OEMs 102 | gc.collect() 103 | torch.cuda.empty_cache() 104 | 105 | # label calibration 106 | all_continuous_pairs,all_continuous_labels = post_process_samples( 107 | all_cell_chrom_samples, 108 | OEMs, 109 | nearest_neighbors, 110 | neighbor_contacts=neighbor_contacts 111 | ) 112 | 113 | all_continuous_pairs,all_continuous_labels = prep_pairs_labels( 114 | all_continuous_pairs, 115 | all_continuous_labels, 116 | OEMs[0].shape[0], 117 | np.arange(len(OEMs)) 118 | ) 119 | 120 | del all_cell_chrom_samples 121 | gc.collect() 122 | torch.cuda.empty_cache() 123 | 124 | # embedding 125 | output_file = os.path.join(runtime_args['data_directory'], '{0}_embeddings'.format(chrom)) 126 | 127 | embed_single_cells_unified( 128 | all_continuous_pairs, 129 | all_continuous_labels, 130 | OEMs, 131 | output_file, 132 | epochs=n_epochs, 133 | cell_nums=None, 134 | batch_size=batch_size, 135 | verbose=True, 136 | prepped=True 137 | ) 138 | 139 | del all_continuous_labels,all_continuous_pairs 140 | 141 | gc.collect() 142 | torch.cuda.empty_cache() 143 | 144 | # cluster on all embeddings 145 | print('Clustering') 146 | scghost_clustering(runtime_args) -------------------------------------------------------------------------------- /utilities/parsers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import torch 4 | import h5py 5 | import pickle 6 | import gc 7 | import os 8 | 9 | from utilities.chrom_sizes import chrom_sizes 10 | from utilities.helper import to_cuda 11 | from sklearn.neighbors import NearestNeighbors 12 | 13 | def parse_config(config_filepath): 14 | with open(config_filepath) as config_file: 15 | config_data = json.load(config_file) 16 | 17 | return config_data 18 | 19 | def parse_higashi_scab(runtime_args): 20 | scAB = h5py.File(runtime_args['higashi_scab_path']) 21 | chromosomes = parse_chromosomes(runtime_args) 22 | 23 | scAB_chrom = np.array(scAB['compartment']['bin']['chrom']).astype(str) 24 | scAB_start = np.array(scAB['compartment']['bin']['start']) 25 | 26 | hig_scab = [] 27 | 28 | num_cells = 0 29 | 30 | for i in range(len(scAB['compartment'])): 31 | if 'cell_%d' % i in scAB['compartment']: 32 | num_cells += 1 33 | 34 | for cn in range(num_cells): 35 | hig_scab.append(scAB['compartment']['cell_%d' % cn]) 36 | 37 | hig_scab = np.array(hig_scab) 38 | 39 | return (hig_scab,scAB_chrom,scAB_start) 40 | 41 | def parse_chrom_embeds(runtime_args,cuda=True): 42 | 43 | chromosomes = parse_chromosomes(runtime_args) 44 | gpu_caching = False if 'cluster_gpu_caching' not in runtime_args else runtime_args['cluster_gpu_caching'] 45 | 46 | hig_scab,scAB_chrom,scAB_start = parse_higashi_scab(runtime_args) 47 | 48 | N = len(hig_scab) 49 | 50 | chrom_embeds = {} 51 | chrom_highlow = {} 52 | 53 | resolution = runtime_args['resolution'] 54 | 55 | for chrom in chromosomes: 56 | 57 | ci_path = os.path.join(runtime_args['data_directory'],'chrom_indices.pkl') if runtime_args['chrom_indices'] is None else runtime_args['chrom_indices'] 58 | chrom_indices = pickle.load( 59 | open(ci_path,'rb') 60 | )['{0}'.format(chrom)] 61 | 62 | scab_chrom_indices = np.where(scAB_chrom == 'chr{0}'.format(chrom))[0] 63 | _,scab_crop,scghost_crop = np.intersect1d(scAB_start[scab_chrom_indices] // resolution,chrom_indices,return_indices=True) 64 | scab_indices = scab_chrom_indices[scab_crop] 65 | scghost_indices = chrom_indices[scghost_crop] 66 | 67 | scab_highidx = np.argsort(hig_scab[:,scab_indices],axis=1)[:,-25:] 68 | scab_lowidx = np.argsort(hig_scab[:,scab_indices],axis=1)[:,:25] 69 | 70 | chrom_highlow['{0}'.format(chrom)] = { 71 | 'high' : to_cuda(torch.tensor(scab_highidx)) if gpu_caching else torch.tensor(scab_highidx), 72 | 'low' : to_cuda(torch.tensor(scab_lowidx)) if gpu_caching else torch.tensor(scab_lowidx) 73 | } 74 | 75 | embedding_flag = ('embeddings' in runtime_args['chromosomes'][chrom]) 76 | 77 | embed_path = os.path.join( 78 | runtime_args['data_directory'],'{0}_embeddings.npy'.format(chrom) 79 | ) 80 | if embedding_flag and runtime_args['chromosomes'][chrom]['embeddings'] is not None: 81 | embed_path = runtime_args['chromosomes'][chrom]['embeddings'] 82 | 83 | scembeds = np.load(embed_path) 84 | scembeds = scembeds[:,scghost_crop] 85 | 86 | chrom_embeds['{0}'.format(chrom)] = to_cuda(torch.tensor(scembeds)) if gpu_caching else torch.tensor(scembeds) 87 | 88 | gc.collect() 89 | 90 | return { 91 | 'embeds':chrom_embeds, 92 | 'highlow':chrom_highlow, 93 | 'N':N, 94 | } 95 | 96 | def parse_chromosomes(runtime_args): 97 | 98 | sizes = chrom_sizes(runtime_args['chrom_sizes']) 99 | chromosomes = runtime_args['chromosomes'] 100 | 101 | # deprecate this if condition 102 | if chromosomes == 'autosomes': 103 | chrom_list = [] 104 | 105 | for chrom in sizes: 106 | chrom_num = chrom[3:] 107 | if chrom_num.isnumeric(): 108 | chrom_list.append(int(chrom_num)) 109 | chromosomes = np.array(chrom_list) 110 | else: 111 | chromosomes = np.array([c for c in chromosomes]) 112 | 113 | return chromosomes 114 | 115 | def parse_nearest_neighbors(runtime_args): 116 | 117 | cell_type = runtime_args['cell_type'] 118 | label_info = pickle.load(open(runtime_args['label_info']['path'],'rb')) if runtime_args['label_info'] is not None else None 119 | 120 | embeddings = np.load(runtime_args['embeddings_path']) 121 | 122 | if label_info is not None and cell_type is not None: 123 | cell_type_key = runtime_args['label_info']['cell_type_key'] 124 | cell_types = np.array(label_info[cell_type_key]).astype(str) 125 | cell_type_index = np.where(cell_types == cell_type) 126 | 127 | embeddings = embeddings[cell_type_index] 128 | 129 | nbrs = NearestNeighbors(n_neighbors=6).fit(embeddings) 130 | _,indices = nbrs.kneighbors(embeddings) 131 | 132 | return indices 133 | 134 | def parse_cell_types(runtime_args): 135 | 136 | if runtime_args['label_info'] is None: 137 | return 138 | 139 | label_info = pickle.load(open(runtime_args['label_info']['path'],'rb')) 140 | 141 | cell_type = runtime_args['cell_type'] 142 | 143 | if cell_type is None: 144 | return 145 | 146 | cell_type_filter = cell_type is not None 147 | cell_types = np.array(label_info[runtime_args['label_info']['cell_type_key']]).astype(str) 148 | cell_type_index = np.where(cell_types == cell_type)[0] if cell_type_filter else np.arange(len(cell_types)) 149 | 150 | return cell_type_index -------------------------------------------------------------------------------- /modules/analysis.py: -------------------------------------------------------------------------------- 1 | # sort chromosome subcompartments using single cell AB compartments from Higashi 2 | import h5py 3 | import os 4 | import seaborn as sns 5 | 6 | os.environ["OMP_NUM_THREADS"] = "10" 7 | 8 | import numpy as np 9 | from tqdm import trange, tqdm 10 | import pickle 11 | import pandas as pd 12 | import argparse 13 | from umap import UMAP 14 | from fbpca import pca 15 | from sklearn.preprocessing import StandardScaler, quantile_transform 16 | from sklearn.decomposition import PCA 17 | from scipy.stats import rankdata 18 | from concurrent.futures import ProcessPoolExecutor, as_completed 19 | import matplotlib.pyplot as plt 20 | import json 21 | 22 | def get_expected(M,eps=1e-8): 23 | E = np.zeros_like(M) 24 | l = len(M) 25 | 26 | for i in range(M.shape[0]): 27 | contacts = np.diag(M,i) 28 | expected = contacts.sum() / (l-i) 29 | # expected = np.median(contacts) 30 | x_diag,y_diag = np.diag_indices(M.shape[0]-i) 31 | x,y = x_diag,y_diag+i 32 | E[x,y] = expected 33 | 34 | E += E.T 35 | E = np.nan_to_num(E) + eps 36 | 37 | return E 38 | 39 | def get_oe_matrix(M): 40 | E = get_expected(M) 41 | oe = np.nan_to_num(M / E) 42 | np.fill_diagonal(oe,1) 43 | 44 | return oe 45 | 46 | # tailored for k=5 and pfc 47 | def prep_scatterplot(embeddings_dir,chrom_indices_file,scAB_file,output_file='tutorial_embeds.hdf5'): 48 | 49 | chrom_indices = pickle.load(open(chrom_indices_file,'rb')) 50 | stacked_pcs = [] 51 | 52 | for chrom_num in range(1,23): 53 | chrom_indices = pickle.load(open('/mnt/e/data/scghost_pfc_output/chrom_indices.pkl','rb'))['%d' % chrom_num] 54 | 55 | sparse_M = np.load('/mnt/e/data/pfc/chr%d_sparse_adj.npy' % chrom_num,allow_pickle=True) 56 | pseudo_bulk = sparse_M.sum(axis=0).toarray() 57 | cov = np.sqrt(pseudo_bulk.sum(axis=1)) 58 | pseudo_bulk /= cov[None,:] 59 | pseudo_bulk /= cov[:,None] 60 | pseudo_bulk = np.nan_to_num(pseudo_bulk)[chrom_indices][:,chrom_indices] 61 | pseudo_OE = get_oe_matrix(pseudo_bulk) 62 | 63 | Rpool = np.nan_to_num(np.corrcoef(pseudo_OE)) 64 | Rpoolmean = Rpool.mean(axis=0,keepdims=True) 65 | Rpool = Rpool - Rpoolmean 66 | _,_,V = np.linalg.svd(Rpool) 67 | 68 | Es = np.load(os.path.join(embeddings_dir,f'/mnt/e/data/scghost_pfc_output/{chrom_num}_embeddings.npy')) 69 | embedding_corrs = np.zeros((Es.shape[0],Es.shape[1],Es.shape[1])) 70 | 71 | num_cells = len(Es) 72 | 73 | for i in trange(num_cells): 74 | embedding_corrs[i] = np.corrcoef(Es[i]) 75 | 76 | pcs = np.zeros((Es.shape[0],Es.shape[1])) 77 | 78 | for i,ec in enumerate(embedding_corrs): 79 | tec = ec - Rpoolmean 80 | pc = tec.dot(V[0,:].T) 81 | pcs[i] = pc 82 | 83 | stacked_pcs.append(pcs) 84 | 85 | stacked_pcs = np.hstack(stacked_pcs) 86 | 87 | with h5py.File(output_file,'w') as f: 88 | f.create_group('compartment') 89 | f['compartment'].create_group('bin') 90 | 91 | for i in range(num_cells): 92 | f['compartment'].create_dataset('cell_%d' % i,data=stacked_pcs[i]) 93 | 94 | 95 | def get_config(config_path = "./config.jSON"): 96 | c = open(config_path,"r") 97 | return json.load(c) 98 | 99 | 100 | def parse_args(): 101 | parser = argparse.ArgumentParser(description="Higashi single cell compartment calling") 102 | parser.add_argument('-c', '--config', type=str, default="./config.JSON") 103 | 104 | return parser.parse_args() 105 | 106 | 107 | def get_palette(label_order, label_name=None, config=None): 108 | try: 109 | palette = config['vis_palette'][label_name] 110 | except: 111 | pal1 = list(sns.color_palette("Paired")) 112 | pal2 = list(sns.color_palette("Set2")) 113 | pal3 = list(sns.color_palette("husl", 12)) 114 | # pal = pal1 + pal2 + pal3 + pal1 115 | # pal = pal1 + pal3 + pal2 116 | pal_all = pal1 + pal2 + pal3 + pal1 + pal2 + pal3 117 | if len(label_order) <= 10: 118 | palette = list([f'C{_}' for _ in range(len(label_order))]) 119 | else: 120 | palette = pal_all[:len(label_order)] 121 | return palette 122 | 123 | 124 | def sc_compartment2embedding(embeds_path,data_dir,output_file="tutorial_scatterplot.pdf",extra="", save_name=""): 125 | label_info = pickle.load(open(os.path.join(data_dir, "label_info.pickle"), "rb")) 126 | label = np.array(label_info["cluster label"]) 127 | print(label) 128 | 129 | ids = np.arange(4238) 130 | label = label[ids] 131 | total_feats = [] 132 | 133 | with h5py.File(embeds_path, "r") as cp_f: 134 | print(cp_f.keys()) 135 | cp = cp_f['compartment'] 136 | 137 | for id_ in trange(len(label)): 138 | v = np.array(cp['cell_%d' % id_]) 139 | total_feats.append(v) 140 | 141 | feats = np.stack(total_feats, axis=0) 142 | print(feats.shape) 143 | 144 | pal = get_palette(np.unique(label)) 145 | 146 | pal = {'L2/3': '#e51f4e', 'L4': '#45af4b', 'L5': '#ffe011', 'L6': '#0081cc', 147 | 'Ndnf': '#ff7f35', 'Vip': '#951eb7', 'Pvalb': '#4febee', 148 | 'Sst': '#ed37d9', 'Astro': '#d1f33c', 'ODC': '#f9bdbb', 149 | 'OPC': '#067d81', 'MG': '#e4bcfc', 'MP': '#ab6c1e', 150 | "Endo": '#780100'} 151 | 152 | 153 | 154 | temp = quantile_transform(feats, output_distribution='uniform', n_quantiles=int(1.0 * feats.shape[0])) 155 | print(feats.shape) 156 | size = 32 157 | pca = PCA(n_components=size) 158 | temp = pca.fit_transform(temp) 159 | 160 | vec = UMAP(n_components=2).fit_transform(temp) 161 | fig, ax = plt.subplots(figsize=(7, 5)) 162 | sns.scatterplot(x=vec[:, 0], y=vec[:, 1], hue=label, linewidth=0, s=2, alpha=1.0, palette=pal) 163 | # 164 | handles, labels = ax.get_legend_handles_labels() 165 | labels, handles = zip(*sorted(zip(labels, handles), key=lambda t: t[0])) 166 | ax.legend(handles=handles, labels=labels, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) 167 | plt.tight_layout() 168 | plt.savefig(output_file, dpi=300) 169 | plt.close('all') 170 | 171 | return (vec,label,pal) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Overview of scGHOST 2 | 3 | ![Overview of scGHOST](scghost_overview.png) 4 | 5 | scGHOST is an unsupervised single-cell subcompartment annotation method based on graph embedding with constrained random walk sampling. 6 | scGHOST is designed to be run on a single-cell Hi-C (scHi-C) dataset which has undergone imputation by [Higashi](https://github.com/ma-compbio/Higashi) ([Zhang et al. 2022](https://www.nature.com/articles/s41587-021-01034-y)). 7 | scGHOST assigns embeddings to genomic loci in the genomes of individual cells by viewing scHi-C as graphs whose vertices are genomic loci and edges are the contact frequencies among loci. 8 | While scGHOST is developed for scHi-C data, it can also identify single-cell subcompartments in single-cell genome imaging data. 9 | 10 | # Running scGHOST 11 | 12 | ## Input data 13 | 14 | scGHOST uses the outputs from [Higashi](https://github.com/ma-compbio/Higashi) as its inputs. 15 | Specifically, it requires the scHi-C imputations (hdf5 format), per-cell embeddings (numpy format), sparse raw scHi-C adjacency maps (numpy format), the scA/B scores (hdf5 format), and the label info file (pickle format) describing the cell types corresponding to each cell in the dataset. 16 | 17 | ## Installation 18 | 19 | Before installing any Python packages, we strongly recommend using Anaconda (please refer to the [Anaconda](https://anaconda.org/) webpage for `conda` installation instructions) to create a python 3.10 environment using the following command: 20 | 21 | `conda install --name scghost python=3.10` 22 | 23 | After creating the environment, activate it using: 24 | 25 | `conda activate scghost` 26 | 27 | ### Dependencies 28 | 29 | #### Conda installations 30 | - PyTorch (2.1.0) with CUDA (11.8) 31 | - Scikit-learn (latest) 32 | - h5py 33 | #### Pip installations 34 | - [cuML for CUDA 11.8](https://docs.rapids.ai/install#selector) 35 | - [Thread-pool Controls](https://pypi.org/project/threadpoolctl/) (> 3) 36 | 37 | Users can install scGHOST dependencies using the `conda` or `pip` commands following the specifications above. 38 | 39 | Systems without a CUDA-capable GPU can also install scGHOST using the same dependencies and installing PyTorch for CPU only, but will have to modify the source code in `modules/clustering.py` to use `SKMeans` instead of `KMeans` under the `scghost_clustering_reworked` function. We may add a flag in the config file to run CPU only instead, but from our experience running scGHOST on the CPU only takes far longer than on a GPU and is not recommended. 40 | 41 | ## Hardware Requirements 42 | 43 | scGHOST can use up to 40 GB of memory for a single-cell dataset of 4,238 cells. 44 | Considering operating system overhead, we recommend running scGHOST on a machine with at least 64 GB of memory to avoid poor performance or out-of-memory errors at runtime. 45 | 46 | scGHOST was developed on a system with a 12-core 12th generation Intel CPU, an Nvidia RTX 3090 GPU with 24GB of VRAM, and 64GB of system memory. With GPU caching enabled, scGHOST uses a maximum of 15 GB of VRAM on the PFC dataset. With GPU caching disabled, VRAM becomes less of a limiting factor and scGHOST should run on any CUDA-capable GPU with at least 4 GB of VRAM. 47 | 48 | ## Usage 49 | 50 | Users can run scGHOST using the following command: 51 | 52 | `python scghost.py --config ` 53 | 54 | Sample JSON config files for scGHOST have been provided. 55 | 56 | `configuration` is the filepath to a custom configuration file adhering to the JSON format for scGHOST. By default, scGHOST uses the included config.json file, which can be modified to the user's specifications. 57 | 58 | **Note**: users may run into a `RuntimeWarning` after the clustering step. This is normal behavior and should not affect the accuracy of results. 59 | 60 | ## Runtime 61 | scGHOST was run on a machine with a 12-core 12th generation Intel CPU and Nvidia RTX 3090 24GB GPU. 62 | From scratch, scGHOST takes about 2 hours to run on the sciHi-C GM12878 dataset and about 4 hours to run on the human prefrontal cortex dataset. 63 | 64 | ## Configuration file 65 | 66 | - `schic_directory` : the directory containing Higashi-imputed single-cell Hi-C maps. 67 | - `label_info` : `label_info.pickle` file following the [format in Higashi](https://github.com/ma-compbio/Higashi/wiki/Input-Files). 68 | - `path` : the file path of the `label_info.pickle` file 69 | - `cell_type_key` : the key in `label_info.pickle` with a list of the cell types in the dataset 70 | - `data_directory` : the output directory of scGHOST 71 | - `chromosomes` : the list of chromosomes to apply scGHOST to. default: autosomes 72 | - `chrom_sizes` : file path to the chromosome sizes file. default: `data/hg38.chrom.sizes` 73 | - `chrom_indices` : file path to chrom indices if previously computed. Development flag to save time over multiple runs on the same dataset. Default: `null` 74 | - `embeddings_path` : file path to the Higashi embeddings `.npy` file for each cell in the dataset 75 | - `higashi_scab_path` : file path to Higashi scA/B scores `.h5` file 76 | - `cell_type` : the cell type in the dataset to apply scGHOST on; use `null` to apply scGHOST to all cell types in the dataset. default: `null` 77 | - `random_walk` : random walk parameters 78 | - `num_walks` : number of random walks per iteration. default: 50 79 | - `ignore_top` : the top and bottom percentile to be ignored, to remove extreme values in the input matrix. default: 0.02 80 | - `top_percentile` : the top percentiles within which random walks are performed. default: 0.25 81 | - `eps` : small float value to prevent dividing by zero in some functions. default: 1e-8 82 | - `num_clusters` : number of clusters to partition chromosomes into 83 | - `neighbor_contacts` : determine whether to use the average of nearest neighbor contacts as the target label during node embedding. 84 | - `nearest_neighbor_override` : use a custom numpy array to define nearest neighbors. The format should be an `N x (k+1)` array with `N` denoting the number of cells in the dataset and `k` denoting the number of nearest neighbors. Row `i` in the array should contain entries denoting which cells are the nearest neighbors of cell `i`. 85 | - `cluster_gpu_caching` : toggle caching chromosome embeddings on the GPU prior to clustering to reduce CPU overhead converting embedding vectors to cuda variables. We recomend disabling this if your GPU memory is less than 16 GB. 86 | - `gpu_uniques` : determine whether to use the GPU to compute unique random walk samples. On machines with higher CPU core counts, CPU processing may be faster than GPU processing. 87 | - `kmeans_init` : the `n_init` parameter in scikit-learn/cuML's `KMeans`. We set this value at a default of 1 to reduce clustering runtime. 88 | 89 | ## Tutorials 90 | 91 | Please follow our tutorial notebooks in the root directory for examples on how to run scGHOST with and without first running Higashi. For a sample run of scGHOST, users can download the smaller WTC-11 dataset [here](http://genome.compbio.cs.cmu.edu:8008/~kxiong/data/scghost/wtc11/). After downloading the sample data, please change the `sample_configs/config_wtc.json` configuration file accordingly to point to the correct paths and run the following command: 92 | 93 | `python scghost.py --config sample_configs/config_wtc.json` 94 | 95 | ## Contact 96 | Please email jianma@cs.cmu.edu or raise an issue in the github repository with any questions about installation or usage or any encountered bugs. 97 | -------------------------------------------------------------------------------- /modules/clustering.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | import pickle 5 | import gc 6 | 7 | from hmmlearn.hmm import GaussianHMM 8 | from sklearn.preprocessing import quantile_transform as sk_quantile_transform 9 | from sklearn.cluster import KMeans 10 | from utilities.parsers import parse_chrom_embeds, parse_chromosomes, parse_chromosomes, parse_higashi_scab, parse_cell_types 11 | from utilities.helper import to_cuda 12 | from sklearn.cluster import KMeans as SKMeans 13 | from tqdm import trange, tqdm 14 | from cuml import KMeans 15 | 16 | DEFAULT_KMEANS_INIT = 10 17 | 18 | def quantile_transform(data, n_quantiles=None): 19 | 20 | nq = n_quantiles if n_quantiles is not None else len(data) 21 | 22 | sorted_data, sort_id = torch.sort(data) 23 | data[sort_id] = torch.arange(data.shape[0],device=data.device,dtype=data.dtype) 24 | data = torch.floor(data / data.shape[0] * nq) / nq 25 | return data 26 | 27 | def scghost_clustering(runtime_args): 28 | embed_data = parse_chrom_embeds(runtime_args) 29 | chromosomes = parse_chromosomes(runtime_args) 30 | cell_type_index = parse_cell_types(runtime_args) 31 | gpu_caching = False if 'cluster_gpu_caching' not in runtime_args else runtime_args['cluster_gpu_caching'] 32 | 33 | kmeans_init = DEFAULT_KMEANS_INIT if 'kmeans_init' not in runtime_args else runtime_args['kmeans_init'] 34 | 35 | chrom_embeds = embed_data['embeds'] 36 | chrom_highlow = embed_data['highlow'] 37 | 38 | N = embed_data['N'] 39 | bar = trange(N) if cell_type_index is None else tqdm(cell_type_index) 40 | # bar = trange(N) if cell_type_index is None else tqdm(range(5)) 41 | 42 | cell_labels = [] 43 | cell_labels_transpose = [] 44 | 45 | for cn in bar: 46 | # for cn in trange(25): 47 | 48 | inter_matrix = [] 49 | 50 | for ii in range(0,len(chromosomes),2): 51 | chrom1 = chromosomes[ii] 52 | embed1 = chrom_embeds['{0}'.format(chrom1)][cn] 53 | embed1 = embed1 if gpu_caching else to_cuda(embed1) 54 | corr1 = torch.corrcoef(embed1) 55 | 56 | hi1 = chrom_highlow['{0}'.format(chrom1)]['high'][cn] 57 | lo1 = chrom_highlow['{0}'.format(chrom1)]['low'][cn] 58 | 59 | slc1 = corr1[hi1] - corr1[lo1] 60 | 61 | row = [] 62 | 63 | for jj in range(1,len(chromosomes),2): 64 | chrom2 = chromosomes[jj] 65 | embed2 = chrom_embeds['{0}'.format(chrom2)][cn] 66 | embed2 = embed2 if gpu_caching else to_cuda(embed2) 67 | corr2 = torch.corrcoef(embed2) 68 | 69 | hi2 = chrom_highlow['{0}'.format(chrom2)]['high'][cn] 70 | lo2 = chrom_highlow['{0}'.format(chrom2)]['low'][cn] 71 | 72 | slc2 = corr2[hi2] - corr2[lo2] 73 | 74 | op = slc1.mean(dim=0)[:,None] * slc2.mean(dim=0)[None] 75 | 76 | opf = op.flatten() 77 | opf = quantile_transform(opf,n_quantiles=1000) 78 | opq = opf.reshape(op.shape) 79 | 80 | row.append(opq) 81 | 82 | row = torch.hstack(row) 83 | inter_matrix.append(row) 84 | 85 | # inter_matrix = torch.from_numpy(np.vstack(inter_matrix)).cuda() 86 | inter_matrix = torch.vstack(inter_matrix) 87 | 88 | L = KMeans(n_clusters=5,n_init=kmeans_init).fit_predict(inter_matrix) 89 | LT = KMeans(n_clusters=5,n_init=kmeans_init).fit_predict(inter_matrix.T) 90 | 91 | cell_labels.append(L.get()) 92 | cell_labels_transpose.append(LT.get()) 93 | # gc.collect() 94 | 95 | cell_labels = np.array(cell_labels) 96 | cell_labels_transpose = np.array(cell_labels_transpose) 97 | 98 | # align using hig_scab 99 | 100 | hig_scab,scAB_chrom,scAB_start = parse_higashi_scab(runtime_args) 101 | 102 | cmap = [] 103 | rmap = [] 104 | chrom_hig = {} 105 | cropped_indices = {} 106 | 107 | data_dir = runtime_args['data_directory'] 108 | 109 | for ii in range(0,len(chromosomes),2): 110 | chrom = chromosomes[ii] 111 | 112 | chrom_indices = pickle.load(open(os.path.join(data_dir,'chrom_indices.pkl'),'rb'))['{0}'.format(chrom)] 113 | scab_chrom_indices = np.where(scAB_chrom == 'chr{0}'.format(runtime_args['chromosomes'][chrom]['integer']))[0] 114 | _,scab_crop,scghost_crop = np.intersect1d(scAB_start[scab_chrom_indices] // 500000,chrom_indices,return_indices=True) 115 | scab_indices = scab_chrom_indices[scab_crop] 116 | scghost_indices = chrom_indices[scghost_crop] 117 | cropped_indices['{0}'.format(chrom)] = scghost_indices 118 | 119 | rmap.append( 120 | np.vstack(( 121 | np.ones(len(scghost_indices)) * runtime_args['chromosomes'][chrom]['integer'], 122 | np.arange(len(scghost_indices)), 123 | scghost_crop, 124 | scghost_indices 125 | )).T 126 | ) 127 | 128 | chrom_hig['{0}'.format(chrom)] = hig_scab[:,scab_indices] 129 | 130 | for ii in range(1,len(chromosomes),2): 131 | chrom = chromosomes[ii] 132 | 133 | chrom_indices = pickle.load(open(os.path.join(data_dir,'chrom_indices.pkl'),'rb'))['{0}'.format(chrom)] 134 | scab_chrom_indices = np.where(scAB_chrom == 'chr{0}'.format(runtime_args['chromosomes'][chrom]['integer']))[0] 135 | 136 | _,scab_crop,scghost_crop = np.intersect1d(scAB_start[scab_chrom_indices] // 500000,chrom_indices,return_indices=True) 137 | scab_indices = scab_chrom_indices[scab_crop] 138 | scghost_indices = chrom_indices[scghost_crop] 139 | cropped_indices['{0}'.format(chrom)] = scghost_indices 140 | 141 | cmap.append( 142 | np.vstack(( 143 | np.ones(len(scghost_indices)) * runtime_args['chromosomes'][chrom]['integer'], 144 | np.arange(len(scghost_indices)), 145 | scghost_crop, 146 | scghost_indices 147 | )).T 148 | ) 149 | 150 | chrom_hig['{0}'.format(chrom)] = hig_scab[:,scab_indices] 151 | 152 | rmap = np.vstack(rmap) 153 | cmap = np.vstack(cmap) 154 | 155 | pickle.dump(cropped_indices,open(os.path.join(data_dir,'cropped_indices.pkl'),'wb')) 156 | 157 | chrom_sorted_labels = {} 158 | 159 | for chrom in chromosomes: 160 | chrom_sorted_labels['{0}'.format(chrom)] = [] 161 | 162 | for i in bar: 163 | ab = chrom_hig['{0}'.format(chrom)][i] 164 | 165 | m = rmap if runtime_args['chromosomes'][chrom]['integer'] % 2 == 1 else cmap 166 | 167 | idx = np.where(m[:,0] == runtime_args['chromosomes'][chrom]['integer'])[0] 168 | lset = cell_labels if runtime_args['chromosomes'][chrom]['integer'] % 2 == 1 else cell_labels_transpose 169 | 170 | lbls = lset[i,idx] 171 | lbls_ab = np.zeros(5) 172 | 173 | for k in range(5): 174 | ii = np.where(lbls == k)[0] 175 | 176 | lbls_ab[k] = ab[ii].mean() 177 | 178 | lbls_order = lbls_ab.argsort()[::-1] 179 | lbls_sorted = lbls.copy() 180 | 181 | for k in range(5): 182 | lbls_sorted[lbls == lbls_order[k]] = k 183 | 184 | chrom_sorted_labels['{0}'.format(chrom)].append(lbls_sorted) 185 | 186 | chrom_sorted_labels['{0}'.format(chrom)] = np.array(chrom_sorted_labels['{0}'.format(chrom)]) 187 | 188 | pickle.dump(chrom_sorted_labels,open(os.path.join(data_dir,'labels.pkl'),'wb')) -------------------------------------------------------------------------------- /tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Generate JSON config file" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Enter scGHOST settings" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# filepath settings\n", 24 | "schic_directory = \"/directory/of/higashi/imputed/maps\"\n", 25 | "label_info_path = \"/path/to/label_info.pickle\"\n", 26 | "label_info_cell_type_key = \"cluster label\"\n", 27 | "data_directory = \"/directory/to/save/scghost/outputs/\"\n", 28 | "\n", 29 | "NUM_CHROMOSOMES = 22\n", 30 | "chromosomes = {chrom_num : {\n", 31 | " 'adj' : f'chr{chrom_num}_sparse_adj.npy',\n", 32 | " 'imputed' : f'chr{chrom_num}_exp1_nbr_5_impute.hdf5',\n", 33 | " 'integer' : chrom_num,\n", 34 | "} for chrom_num in range(1,NUM_CHROMOSOMES+1)}\n", 35 | "\n", 36 | "chrom_sizes = 'data/hg19.chrom.sizes'\n", 37 | "chrom_indices = None\n", 38 | "embeddings_path = \"/path/to/exp1_0_origin.npy\"\n", 39 | "higashi_scab_path = \"/path/to/higashi/scAB.hdf5\"\n", 40 | "cell_type = None\n", 41 | "\n", 42 | "# hyperparameters\n", 43 | "random_walk_num_walks = 50\n", 44 | "random_walk_ignore_top = 0.02\n", 45 | "random_walk_top_percentile = 0.25\n", 46 | "eps = 1e-8\n", 47 | "num_clusters = 5\n", 48 | "batch_size = 16\n", 49 | "epochs = 5\n", 50 | "resolution = 500000\n", 51 | "neighbor_contacts = False\n", 52 | "kmeans_init = 1\n", 53 | "\n", 54 | "# misc settings\n", 55 | "nearest_neighbor_override = None\n", 56 | "gpu_uniques = True\n", 57 | "cluster_gpu_caching = True" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "Generate python dictionary" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 2, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "settings_dict = {\n", 74 | " 'schic_directory': schic_directory,\n", 75 | " 'label_info': {\n", 76 | " 'path': label_info_path,\n", 77 | " 'cell_type_key': label_info_cell_type_key,\n", 78 | " },\n", 79 | " 'data_directory': data_directory,\n", 80 | " 'chromosomes': chromosomes,\n", 81 | " 'chrom_sizes': chrom_sizes,\n", 82 | " 'chrom_indices': chrom_indices,\n", 83 | " 'embeddings_path': embeddings_path,\n", 84 | " 'higashi_scab_path': higashi_scab_path,\n", 85 | " 'cell_type': cell_type,\n", 86 | " 'random_walk': {\n", 87 | " 'num_walks': random_walk_num_walks,\n", 88 | " 'ignore_top': random_walk_ignore_top,\n", 89 | " 'top_percentile': random_walk_top_percentile,\n", 90 | " },\n", 91 | " 'epis': eps,\n", 92 | " 'num_clusters': num_clusters,\n", 93 | " 'batch_size': batch_size,\n", 94 | " 'epochs': epochs,\n", 95 | " 'resolution': resolution,\n", 96 | " 'neighbor_contacts': neighbor_contacts,\n", 97 | " 'nearest_neighbor_override': nearest_neighbor_override,\n", 98 | " 'gpu_uniques': gpu_uniques,\n", 99 | " 'cluster_gpu_caching': cluster_gpu_caching,\n", 100 | " 'kmeans_init': kmeans_init,\n", 101 | "}\n", 102 | "\n", 103 | "\n", 104 | "import json \n", 105 | "\n", 106 | "with open(\"tutorial.json\", \"w\") as outfile: \n", 107 | " json_string = json.dumps(settings_dict, indent=4)\n", 108 | " outfile.write(json_string)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "### Run scGHOST" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "import subprocess\n", 125 | "\n", 126 | "subprocess.call(['python scghost.py --config tutorial.json'],shell=True)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "### Format scGHOST output" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 2, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "name": "stderr", 143 | "output_type": "stream", 144 | "text": [ 145 | "100%|██████████| 4238/4238 [01:01<00:00, 69.12it/s]\n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "import pickle\n", 151 | "import os\n", 152 | "from tqdm import trange\n", 153 | "\n", 154 | "# enter labels.pkl path\n", 155 | "label_filepath = '/mnt/e/data/scghost_pfc_output/publication_results/labels.pkl'\n", 156 | "labels = pickle.load(open(label_filepath,'rb'))\n", 157 | "\n", 158 | "# enter cropped_indices.pkl path\n", 159 | "cropped_indices_filepath = '/mnt/e/data/scghost_pfc_output/publication_results/cropped_indices.pkl'\n", 160 | "cropped_indices = pickle.load(open(cropped_indices_filepath,'rb'))\n", 161 | "\n", 162 | "# enter resolution\n", 163 | "resolution = 500000\n", 164 | "\n", 165 | "# enter bed file output directory\n", 166 | "bed_file_directory = 'bed_files'\n", 167 | "chrom_prefix = 'chr' # change this to '' if chromosomes are labeled chr1,chr2,... instead of 1,2,...\n", 168 | "\n", 169 | "sc_subcompartment_names = ['scA1','scA2','scB1','scB2','scB3'] # default for scGHOST k=5\n", 170 | "\n", 171 | "os.makedirs(bed_file_directory,exist_ok=True)\n", 172 | "\n", 173 | "num_cells = labels[ list( labels.keys() )[0] ].shape[0]\n", 174 | "\n", 175 | "for cell_num in trange(num_cells):\n", 176 | "\n", 177 | " with open(os.path.join(bed_file_directory,f'cell_{cell_num}.bed'),'w') as f:\n", 178 | "\n", 179 | " for chromosome in labels:\n", 180 | "\n", 181 | " annotations = labels[chromosome][cell_num]\n", 182 | "\n", 183 | " for locus in range(len(annotations)):\n", 184 | "\n", 185 | " position = cropped_indices[chromosome][locus]\n", 186 | " annotation = sc_subcompartment_names[ annotations[locus] ]\n", 187 | "\n", 188 | " line = f'{chrom_prefix}{chromosome}\\t{int(position * resolution)}\\t{int((position+1) * resolution)}\\t{annotation}\\n'\n", 189 | " f.write(line)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "### Generate scatter plot" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 1, 202 | "metadata": {}, 203 | "outputs": [ 204 | { 205 | "name": "stderr", 206 | "output_type": "stream", 207 | "text": [ 208 | "/mnt/c/Users/turke/scghost_public/scGHOST/modules/analysis.py:45: RuntimeWarning: invalid value encountered in divide\n", 209 | " pseudo_bulk /= cov[None,:]\n", 210 | "/mnt/c/Users/turke/scghost_public/scGHOST/modules/analysis.py:46: RuntimeWarning: invalid value encountered in divide\n", 211 | " pseudo_bulk /= cov[:,None]\n", 212 | "100%|██████████| 4238/4238 [00:10<00:00, 420.16it/s]\n", 213 | "100%|██████████| 4238/4238 [00:13<00:00, 323.12it/s]\n", 214 | "100%|██████████| 4238/4238 [00:07<00:00, 530.87it/s]\n", 215 | "100%|██████████| 4238/4238 [00:08<00:00, 473.63it/s]\n", 216 | "100%|██████████| 4238/4238 [00:05<00:00, 731.00it/s]\n", 217 | "100%|██████████| 4238/4238 [00:05<00:00, 790.55it/s]\n", 218 | "100%|██████████| 4238/4238 [00:06<00:00, 684.44it/s]\n", 219 | "100%|██████████| 4238/4238 [00:05<00:00, 733.62it/s] \n", 220 | "100%|██████████| 4238/4238 [00:03<00:00, 1174.09it/s]\n", 221 | "100%|██████████| 4238/4238 [00:05<00:00, 818.62it/s] \n", 222 | "100%|██████████| 4238/4238 [00:04<00:00, 904.82it/s] \n", 223 | "100%|██████████| 4238/4238 [00:04<00:00, 966.01it/s] \n", 224 | "100%|██████████| 4238/4238 [00:02<00:00, 1833.86it/s]\n", 225 | "100%|██████████| 4238/4238 [00:03<00:00, 1252.75it/s]\n", 226 | "100%|██████████| 4238/4238 [00:01<00:00, 2557.56it/s]\n", 227 | "100%|██████████| 4238/4238 [00:01<00:00, 2641.86it/s]\n", 228 | "100%|██████████| 4238/4238 [00:01<00:00, 2954.00it/s]\n", 229 | "100%|██████████| 4238/4238 [00:01<00:00, 3383.79it/s]\n", 230 | "100%|██████████| 4238/4238 [00:00<00:00, 4811.81it/s]\n", 231 | "100%|██████████| 4238/4238 [00:01<00:00, 4216.22it/s]\n", 232 | "100%|██████████| 4238/4238 [00:00<00:00, 9060.52it/s]\n", 233 | "100%|██████████| 4238/4238 [00:00<00:00, 10878.97it/s]\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "# sort chromosome subcompartments using single cell AB compartments from Higashi\n", 239 | "from modules.analysis import prep_scatterplot\n", 240 | "\n", 241 | "prep_scatterplot(\n", 242 | " '/mnt/e/data/scghost_pfc_output/working_results/',\n", 243 | " '/mnt/e/data/scghost_pfc_output/chrom_indices.pkl',\n", 244 | " '/mnt/e/data/pfc/scAB.hdf5'\n", 245 | ")" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 1, 251 | "metadata": {}, 252 | "outputs": [ 253 | { 254 | "name": "stderr", 255 | "output_type": "stream", 256 | "text": [ 257 | "/home/kyle/anaconda3/envs/dr/lib/python3.10/site-packages/umap/distances.py:1063: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n", 258 | " @numba.jit()\n", 259 | "/home/kyle/anaconda3/envs/dr/lib/python3.10/site-packages/umap/distances.py:1071: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n", 260 | " @numba.jit()\n", 261 | "/home/kyle/anaconda3/envs/dr/lib/python3.10/site-packages/umap/distances.py:1086: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n", 262 | " @numba.jit()\n", 263 | "/home/kyle/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 264 | " from .autonotebook import tqdm as notebook_tqdm\n", 265 | "/home/kyle/anaconda3/envs/dr/lib/python3.10/site-packages/umap/umap_.py:660: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n", 266 | " @numba.jit()\n" 267 | ] 268 | }, 269 | { 270 | "name": "stdout", 271 | "output_type": "stream", 272 | "text": [ 273 | "['L2/3' 'L2/3' 'L2/3' ... 'L4' 'Astro' 'ODC']\n", 274 | "\n" 275 | ] 276 | }, 277 | { 278 | "name": "stderr", 279 | "output_type": "stream", 280 | "text": [ 281 | "100%|██████████| 4238/4238 [00:04<00:00, 984.72it/s] \n" 282 | ] 283 | }, 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "(4238, 5432)\n", 289 | "(4238, 5432)\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "from modules.analysis import sc_compartment2embedding\n", 295 | "import seaborn as sns\n", 296 | "import matplotlib.pyplot as plt\n", 297 | "\n", 298 | "(vec,label,pal) = sc_compartment2embedding('./tutorial_embeds.hdf5','/mnt/e/data/pfc/','tutorial_scatter.pdf')\n", 299 | "\n", 300 | "fig, ax = plt.subplots(figsize=(7, 5))\n", 301 | "sns.scatterplot(x=vec[:, 0], y=vec[:, 1], hue=label, linewidth=0, s=2, alpha=1.0, palette=pal)\n", 302 | "\n", 303 | "handles, labels = ax.get_legend_handles_labels()\n", 304 | "labels, handles = zip(*sorted(zip(labels, handles), key=lambda t: t[0]))\n", 305 | "ax.legend(handles=handles, labels=labels, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)" 306 | ] 307 | } 308 | ], 309 | "metadata": { 310 | "kernelspec": { 311 | "display_name": "dr", 312 | "language": "python", 313 | "name": "python3" 314 | }, 315 | "language_info": { 316 | "codemirror_mode": { 317 | "name": "ipython", 318 | "version": 3 319 | }, 320 | "file_extension": ".py", 321 | "mimetype": "text/x-python", 322 | "name": "python", 323 | "nbconvert_exporter": "python", 324 | "pygments_lexer": "ipython3", 325 | "version": "3.10.11" 326 | } 327 | }, 328 | "nbformat": 4, 329 | "nbformat_minor": 2 330 | } 331 | -------------------------------------------------------------------------------- /data/hg38.chrom.sizes: -------------------------------------------------------------------------------- 1 | chr1 248956422 2 | chr2 242193529 3 | chr3 198295559 4 | chr4 190214555 5 | chr5 181538259 6 | chr6 170805979 7 | chr7 159345973 8 | chr8 145138636 9 | chr9 138394717 10 | chr10 133797422 11 | chr11 135086622 12 | chr12 133275309 13 | chr13 114364328 14 | chr14 107043718 15 | chr15 101991189 16 | chr16 90338345 17 | chr17 83257441 18 | chr18 80373285 19 | chr19 58617616 20 | chr20 64444167 21 | chr21 46709983 22 | chr22 50818468 23 | chrX 156040895 24 | chrY 57227415 25 | chrM 16569 26 | chr11_KI270721v1_random 100316 27 | chr14_GL000009v2_random 201709 28 | chr14_GL000225v1_random 211173 29 | chr14_KI270722v1_random 194050 30 | chr14_GL000194v1_random 191469 31 | chr14_KI270723v1_random 38115 32 | chr14_KI270724v1_random 39555 33 | chr14_KI270725v1_random 172810 34 | chr14_KI270726v1_random 43739 35 | chr15_KI270727v1_random 448248 36 | chr16_KI270728v1_random 1872759 37 | chr17_GL000205v2_random 185591 38 | chr17_KI270729v1_random 280839 39 | chr17_KI270730v1_random 112551 40 | chr1_KI270706v1_random 175055 41 | chr1_KI270707v1_random 32032 42 | chr1_KI270708v1_random 127682 43 | chr1_KI270709v1_random 66860 44 | chr1_KI270710v1_random 40176 45 | chr1_KI270711v1_random 42210 46 | chr1_KI270712v1_random 176043 47 | chr1_KI270713v1_random 40745 48 | chr1_KI270714v1_random 41717 49 | chr22_KI270731v1_random 150754 50 | chr22_KI270732v1_random 41543 51 | chr22_KI270733v1_random 179772 52 | chr22_KI270734v1_random 165050 53 | chr22_KI270735v1_random 42811 54 | chr22_KI270736v1_random 181920 55 | chr22_KI270737v1_random 103838 56 | chr22_KI270738v1_random 99375 57 | chr22_KI270739v1_random 73985 58 | chr2_KI270715v1_random 161471 59 | chr2_KI270716v1_random 153799 60 | chr3_GL000221v1_random 155397 61 | chr4_GL000008v2_random 209709 62 | chr5_GL000208v1_random 92689 63 | chr9_KI270717v1_random 40062 64 | chr9_KI270718v1_random 38054 65 | chr9_KI270719v1_random 176845 66 | chr9_KI270720v1_random 39050 67 | chr1_KI270762v1_alt 354444 68 | chr1_KI270766v1_alt 256271 69 | chr1_KI270760v1_alt 109528 70 | chr1_KI270765v1_alt 185285 71 | chr1_GL383518v1_alt 182439 72 | chr1_GL383519v1_alt 110268 73 | chr1_GL383520v2_alt 366580 74 | chr1_KI270764v1_alt 50258 75 | chr1_KI270763v1_alt 911658 76 | chr1_KI270759v1_alt 425601 77 | chr1_KI270761v1_alt 165834 78 | chr2_KI270770v1_alt 136240 79 | chr2_KI270773v1_alt 70887 80 | chr2_KI270774v1_alt 223625 81 | chr2_KI270769v1_alt 120616 82 | chr2_GL383521v1_alt 143390 83 | chr2_KI270772v1_alt 133041 84 | chr2_KI270775v1_alt 138019 85 | chr2_KI270771v1_alt 110395 86 | chr2_KI270768v1_alt 110099 87 | chr2_GL582966v2_alt 96131 88 | chr2_GL383522v1_alt 123821 89 | chr2_KI270776v1_alt 174166 90 | chr2_KI270767v1_alt 161578 91 | chr3_JH636055v2_alt 173151 92 | chr3_KI270783v1_alt 109187 93 | chr3_KI270780v1_alt 224108 94 | chr3_GL383526v1_alt 180671 95 | chr3_KI270777v1_alt 173649 96 | chr3_KI270778v1_alt 248252 97 | chr3_KI270781v1_alt 113034 98 | chr3_KI270779v1_alt 205312 99 | chr3_KI270782v1_alt 162429 100 | chr3_KI270784v1_alt 184404 101 | chr4_KI270790v1_alt 220246 102 | chr4_GL383528v1_alt 376187 103 | chr4_KI270787v1_alt 111943 104 | chr4_GL000257v2_alt 586476 105 | chr4_KI270788v1_alt 158965 106 | chr4_GL383527v1_alt 164536 107 | chr4_KI270785v1_alt 119912 108 | chr4_KI270789v1_alt 205944 109 | chr4_KI270786v1_alt 244096 110 | chr5_KI270793v1_alt 126136 111 | chr5_KI270792v1_alt 179043 112 | chr5_KI270791v1_alt 195710 113 | chr5_GL383532v1_alt 82728 114 | chr5_GL949742v1_alt 226852 115 | chr5_KI270794v1_alt 164558 116 | chr5_GL339449v2_alt 1612928 117 | chr5_GL383530v1_alt 101241 118 | chr5_KI270796v1_alt 172708 119 | chr5_GL383531v1_alt 173459 120 | chr5_KI270795v1_alt 131892 121 | chr6_GL000250v2_alt 4672374 122 | chr6_KI270800v1_alt 175808 123 | chr6_KI270799v1_alt 152148 124 | chr6_GL383533v1_alt 124736 125 | chr6_KI270801v1_alt 870480 126 | chr6_KI270802v1_alt 75005 127 | chr6_KB021644v2_alt 185823 128 | chr6_KI270797v1_alt 197536 129 | chr6_KI270798v1_alt 271782 130 | chr7_KI270804v1_alt 157952 131 | chr7_KI270809v1_alt 209586 132 | chr7_KI270806v1_alt 158166 133 | chr7_GL383534v2_alt 119183 134 | chr7_KI270803v1_alt 1111570 135 | chr7_KI270808v1_alt 271455 136 | chr7_KI270807v1_alt 126434 137 | chr7_KI270805v1_alt 209988 138 | chr8_KI270818v1_alt 145606 139 | chr8_KI270812v1_alt 282736 140 | chr8_KI270811v1_alt 292436 141 | chr8_KI270821v1_alt 985506 142 | chr8_KI270813v1_alt 300230 143 | chr8_KI270822v1_alt 624492 144 | chr8_KI270814v1_alt 141812 145 | chr8_KI270810v1_alt 374415 146 | chr8_KI270819v1_alt 133535 147 | chr8_KI270820v1_alt 36640 148 | chr8_KI270817v1_alt 158983 149 | chr8_KI270816v1_alt 305841 150 | chr8_KI270815v1_alt 132244 151 | chr9_GL383539v1_alt 162988 152 | chr9_GL383540v1_alt 71551 153 | chr9_GL383541v1_alt 171286 154 | chr9_GL383542v1_alt 60032 155 | chr9_KI270823v1_alt 439082 156 | chr10_GL383545v1_alt 179254 157 | chr10_KI270824v1_alt 181496 158 | chr10_GL383546v1_alt 309802 159 | chr10_KI270825v1_alt 188315 160 | chr11_KI270832v1_alt 210133 161 | chr11_KI270830v1_alt 177092 162 | chr11_KI270831v1_alt 296895 163 | chr11_KI270829v1_alt 204059 164 | chr11_GL383547v1_alt 154407 165 | chr11_JH159136v1_alt 200998 166 | chr11_JH159137v1_alt 191409 167 | chr11_KI270827v1_alt 67707 168 | chr11_KI270826v1_alt 186169 169 | chr12_GL877875v1_alt 167313 170 | chr12_GL877876v1_alt 408271 171 | chr12_KI270837v1_alt 40090 172 | chr12_GL383549v1_alt 120804 173 | chr12_KI270835v1_alt 238139 174 | chr12_GL383550v2_alt 169178 175 | chr12_GL383552v1_alt 138655 176 | chr12_GL383553v2_alt 152874 177 | chr12_KI270834v1_alt 119498 178 | chr12_GL383551v1_alt 184319 179 | chr12_KI270833v1_alt 76061 180 | chr12_KI270836v1_alt 56134 181 | chr13_KI270840v1_alt 191684 182 | chr13_KI270839v1_alt 180306 183 | chr13_KI270843v1_alt 103832 184 | chr13_KI270841v1_alt 169134 185 | chr13_KI270838v1_alt 306913 186 | chr13_KI270842v1_alt 37287 187 | chr14_KI270844v1_alt 322166 188 | chr14_KI270847v1_alt 1511111 189 | chr14_KI270845v1_alt 180703 190 | chr14_KI270846v1_alt 1351393 191 | chr15_KI270852v1_alt 478999 192 | chr15_KI270851v1_alt 263054 193 | chr15_KI270848v1_alt 327382 194 | chr15_GL383554v1_alt 296527 195 | chr15_KI270849v1_alt 244917 196 | chr15_GL383555v2_alt 388773 197 | chr15_KI270850v1_alt 430880 198 | chr16_KI270854v1_alt 134193 199 | chr16_KI270856v1_alt 63982 200 | chr16_KI270855v1_alt 232857 201 | chr16_KI270853v1_alt 2659700 202 | chr16_GL383556v1_alt 192462 203 | chr16_GL383557v1_alt 89672 204 | chr17_GL383563v3_alt 375691 205 | chr17_KI270862v1_alt 391357 206 | chr17_KI270861v1_alt 196688 207 | chr17_KI270857v1_alt 2877074 208 | chr17_JH159146v1_alt 278131 209 | chr17_JH159147v1_alt 70345 210 | chr17_GL383564v2_alt 133151 211 | chr17_GL000258v2_alt 1821992 212 | chr17_GL383565v1_alt 223995 213 | chr17_KI270858v1_alt 235827 214 | chr17_KI270859v1_alt 108763 215 | chr17_GL383566v1_alt 90219 216 | chr17_KI270860v1_alt 178921 217 | chr18_KI270864v1_alt 111737 218 | chr18_GL383567v1_alt 289831 219 | chr18_GL383570v1_alt 164789 220 | chr18_GL383571v1_alt 198278 221 | chr18_GL383568v1_alt 104552 222 | chr18_GL383569v1_alt 167950 223 | chr18_GL383572v1_alt 159547 224 | chr18_KI270863v1_alt 167999 225 | chr19_KI270868v1_alt 61734 226 | chr19_KI270865v1_alt 52969 227 | chr19_GL383573v1_alt 385657 228 | chr19_GL383575v2_alt 170222 229 | chr19_GL383576v1_alt 188024 230 | chr19_GL383574v1_alt 155864 231 | chr19_KI270866v1_alt 43156 232 | chr19_KI270867v1_alt 233762 233 | chr19_GL949746v1_alt 987716 234 | chr20_GL383577v2_alt 128386 235 | chr20_KI270869v1_alt 118774 236 | chr20_KI270871v1_alt 58661 237 | chr20_KI270870v1_alt 183433 238 | chr21_GL383578v2_alt 63917 239 | chr21_KI270874v1_alt 166743 240 | chr21_KI270873v1_alt 143900 241 | chr21_GL383579v2_alt 201197 242 | chr21_GL383580v2_alt 74653 243 | chr21_GL383581v2_alt 116689 244 | chr21_KI270872v1_alt 82692 245 | chr22_KI270875v1_alt 259914 246 | chr22_KI270878v1_alt 186262 247 | chr22_KI270879v1_alt 304135 248 | chr22_KI270876v1_alt 263666 249 | chr22_KI270877v1_alt 101331 250 | chr22_GL383583v2_alt 96924 251 | chr22_GL383582v2_alt 162811 252 | chrX_KI270880v1_alt 284869 253 | chrX_KI270881v1_alt 144206 254 | chr19_KI270882v1_alt 248807 255 | chr19_KI270883v1_alt 170399 256 | chr19_KI270884v1_alt 157053 257 | chr19_KI270885v1_alt 171027 258 | chr19_KI270886v1_alt 204239 259 | chr19_KI270887v1_alt 209512 260 | chr19_KI270888v1_alt 155532 261 | chr19_KI270889v1_alt 170698 262 | chr19_KI270890v1_alt 184499 263 | chr19_KI270891v1_alt 170680 264 | chr1_KI270892v1_alt 162212 265 | chr2_KI270894v1_alt 214158 266 | chr2_KI270893v1_alt 161218 267 | chr3_KI270895v1_alt 162896 268 | chr4_KI270896v1_alt 378547 269 | chr5_KI270897v1_alt 1144418 270 | chr5_KI270898v1_alt 130957 271 | chr6_GL000251v2_alt 4795265 272 | chr7_KI270899v1_alt 190869 273 | chr8_KI270901v1_alt 136959 274 | chr8_KI270900v1_alt 318687 275 | chr11_KI270902v1_alt 106711 276 | chr11_KI270903v1_alt 214625 277 | chr12_KI270904v1_alt 572349 278 | chr15_KI270906v1_alt 196384 279 | chr15_KI270905v1_alt 5161414 280 | chr17_KI270907v1_alt 137721 281 | chr17_KI270910v1_alt 157099 282 | chr17_KI270909v1_alt 325800 283 | chr17_JH159148v1_alt 88070 284 | chr17_KI270908v1_alt 1423190 285 | chr18_KI270912v1_alt 174061 286 | chr18_KI270911v1_alt 157710 287 | chr19_GL949747v2_alt 729520 288 | chr22_KB663609v1_alt 74013 289 | chrX_KI270913v1_alt 274009 290 | chr19_KI270914v1_alt 205194 291 | chr19_KI270915v1_alt 170665 292 | chr19_KI270916v1_alt 184516 293 | chr19_KI270917v1_alt 190932 294 | chr19_KI270918v1_alt 123111 295 | chr19_KI270919v1_alt 170701 296 | chr19_KI270920v1_alt 198005 297 | chr19_KI270921v1_alt 282224 298 | chr19_KI270922v1_alt 187935 299 | chr19_KI270923v1_alt 189352 300 | chr3_KI270924v1_alt 166540 301 | chr4_KI270925v1_alt 555799 302 | chr6_GL000252v2_alt 4604811 303 | chr8_KI270926v1_alt 229282 304 | chr11_KI270927v1_alt 218612 305 | chr19_GL949748v2_alt 1064304 306 | chr22_KI270928v1_alt 176103 307 | chr19_KI270929v1_alt 186203 308 | chr19_KI270930v1_alt 200773 309 | chr19_KI270931v1_alt 170148 310 | chr19_KI270932v1_alt 215732 311 | chr19_KI270933v1_alt 170537 312 | chr19_GL000209v2_alt 177381 313 | chr3_KI270934v1_alt 163458 314 | chr6_GL000253v2_alt 4677643 315 | chr19_GL949749v2_alt 1091841 316 | chr3_KI270935v1_alt 197351 317 | chr6_GL000254v2_alt 4827813 318 | chr19_GL949750v2_alt 1066390 319 | chr3_KI270936v1_alt 164170 320 | chr6_GL000255v2_alt 4606388 321 | chr19_GL949751v2_alt 1002683 322 | chr3_KI270937v1_alt 165607 323 | chr6_GL000256v2_alt 4929269 324 | chr19_GL949752v1_alt 987100 325 | chr6_KI270758v1_alt 76752 326 | chr19_GL949753v2_alt 796479 327 | chr19_KI270938v1_alt 1066800 328 | chrUn_KI270302v1 2274 329 | chrUn_KI270304v1 2165 330 | chrUn_KI270303v1 1942 331 | chrUn_KI270305v1 1472 332 | chrUn_KI270322v1 21476 333 | chrUn_KI270320v1 4416 334 | chrUn_KI270310v1 1201 335 | chrUn_KI270316v1 1444 336 | chrUn_KI270315v1 2276 337 | chrUn_KI270312v1 998 338 | chrUn_KI270311v1 12399 339 | chrUn_KI270317v1 37690 340 | chrUn_KI270412v1 1179 341 | chrUn_KI270411v1 2646 342 | chrUn_KI270414v1 2489 343 | chrUn_KI270419v1 1029 344 | chrUn_KI270418v1 2145 345 | chrUn_KI270420v1 2321 346 | chrUn_KI270424v1 2140 347 | chrUn_KI270417v1 2043 348 | chrUn_KI270422v1 1445 349 | chrUn_KI270423v1 981 350 | chrUn_KI270425v1 1884 351 | chrUn_KI270429v1 1361 352 | chrUn_KI270442v1 392061 353 | chrUn_KI270466v1 1233 354 | chrUn_KI270465v1 1774 355 | chrUn_KI270467v1 3920 356 | chrUn_KI270435v1 92983 357 | chrUn_KI270438v1 112505 358 | chrUn_KI270468v1 4055 359 | chrUn_KI270510v1 2415 360 | chrUn_KI270509v1 2318 361 | chrUn_KI270518v1 2186 362 | chrUn_KI270508v1 1951 363 | chrUn_KI270516v1 1300 364 | chrUn_KI270512v1 22689 365 | chrUn_KI270519v1 138126 366 | chrUn_KI270522v1 5674 367 | chrUn_KI270511v1 8127 368 | chrUn_KI270515v1 6361 369 | chrUn_KI270507v1 5353 370 | chrUn_KI270517v1 3253 371 | chrUn_KI270529v1 1899 372 | chrUn_KI270528v1 2983 373 | chrUn_KI270530v1 2168 374 | chrUn_KI270539v1 993 375 | chrUn_KI270538v1 91309 376 | chrUn_KI270544v1 1202 377 | chrUn_KI270548v1 1599 378 | chrUn_KI270583v1 1400 379 | chrUn_KI270587v1 2969 380 | chrUn_KI270580v1 1553 381 | chrUn_KI270581v1 7046 382 | chrUn_KI270579v1 31033 383 | chrUn_KI270589v1 44474 384 | chrUn_KI270590v1 4685 385 | chrUn_KI270584v1 4513 386 | chrUn_KI270582v1 6504 387 | chrUn_KI270588v1 6158 388 | chrUn_KI270593v1 3041 389 | chrUn_KI270591v1 5796 390 | chrUn_KI270330v1 1652 391 | chrUn_KI270329v1 1040 392 | chrUn_KI270334v1 1368 393 | chrUn_KI270333v1 2699 394 | chrUn_KI270335v1 1048 395 | chrUn_KI270338v1 1428 396 | chrUn_KI270340v1 1428 397 | chrUn_KI270336v1 1026 398 | chrUn_KI270337v1 1121 399 | chrUn_KI270363v1 1803 400 | chrUn_KI270364v1 2855 401 | chrUn_KI270362v1 3530 402 | chrUn_KI270366v1 8320 403 | chrUn_KI270378v1 1048 404 | chrUn_KI270379v1 1045 405 | chrUn_KI270389v1 1298 406 | chrUn_KI270390v1 2387 407 | chrUn_KI270387v1 1537 408 | chrUn_KI270395v1 1143 409 | chrUn_KI270396v1 1880 410 | chrUn_KI270388v1 1216 411 | chrUn_KI270394v1 970 412 | chrUn_KI270386v1 1788 413 | chrUn_KI270391v1 1484 414 | chrUn_KI270383v1 1750 415 | chrUn_KI270393v1 1308 416 | chrUn_KI270384v1 1658 417 | chrUn_KI270392v1 971 418 | chrUn_KI270381v1 1930 419 | chrUn_KI270385v1 990 420 | chrUn_KI270382v1 4215 421 | chrUn_KI270376v1 1136 422 | chrUn_KI270374v1 2656 423 | chrUn_KI270372v1 1650 424 | chrUn_KI270373v1 1451 425 | chrUn_KI270375v1 2378 426 | chrUn_KI270371v1 2805 427 | chrUn_KI270448v1 7992 428 | chrUn_KI270521v1 7642 429 | chrUn_GL000195v1 182896 430 | chrUn_GL000219v1 179198 431 | chrUn_GL000220v1 161802 432 | chrUn_GL000224v1 179693 433 | chrUn_KI270741v1 157432 434 | chrUn_GL000226v1 15008 435 | chrUn_GL000213v1 164239 436 | chrUn_KI270743v1 210658 437 | chrUn_KI270744v1 168472 438 | chrUn_KI270745v1 41891 439 | chrUn_KI270746v1 66486 440 | chrUn_KI270747v1 198735 441 | chrUn_KI270748v1 93321 442 | chrUn_KI270749v1 158759 443 | chrUn_KI270750v1 148850 444 | chrUn_KI270751v1 150742 445 | chrUn_KI270752v1 27745 446 | chrUn_KI270753v1 62944 447 | chrUn_KI270754v1 40191 448 | chrUn_KI270755v1 36723 449 | chrUn_KI270756v1 79590 450 | chrUn_KI270757v1 71251 451 | chrUn_GL000214v1 137718 452 | chrUn_KI270742v1 186739 453 | chrUn_GL000216v2 176608 454 | chrUn_GL000218v1 161147 455 | chrY_KI270740v1_random 37240 456 | --------------------------------------------------------------------------------