├── scghost_overview.png
├── utilities
    ├── gpu.py
    ├── chrom_sizes.py
    ├── helper.py
    └── parsers.py
├── LICENSE
├── data
    ├── mm10.chrom.sizes
    ├── hg19.chrom.sizes
    └── hg38.chrom.sizes
├── modules
    ├── postprocessing.py
    ├── random_walk.py
    ├── preprocessing.py
    ├── embedding.py
    ├── analysis.py
    └── clustering.py
├── .gitignore
├── sample_configs
    ├── config_wtc.json
    ├── config_GM12878.json
    └── config_pfc.json
├── config.json
├── requirements.txt
├── scghost.py
├── README.md
└── tutorial.ipynb


/scghost_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ma-compbio/scGHOST/HEAD/scghost_overview.png


--------------------------------------------------------------------------------
/utilities/gpu.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
def to_cuda(x):
    """Return ``x.cuda()`` when CUDA is available, otherwise ``x`` unchanged."""
    return x.cuda() if torch.cuda.is_available() else x


--------------------------------------------------------------------------------
/utilities/chrom_sizes.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | def chrom_sizes(f,length=np.inf):
 4 |     data = open(f,'r')
 5 |     
 6 |     sizes = {}
 7 |     
 8 |     for line in data:
 9 |         ldata = line.split()
10 |         
11 |         if len(ldata[0]) > length:
12 |             continue
13 |             
14 |         sizes[ldata[0]] = int(ldata[1])
15 | 
16 |     return sizes


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Ma Lab at CMU
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/data/mm10.chrom.sizes:
--------------------------------------------------------------------------------
 1 | chr1	195471971
 2 | chr2	182113224
 3 | chr3	160039680
 4 | chr4	156508116
 5 | chr5	151834684
 6 | chr6	149736546
 7 | chr7	145441459
 8 | chr8	129401213
 9 | chr9	124595110
10 | chr10	130694993
11 | chr11	122082543
12 | chr12	120129022
13 | chr13	120421639
14 | chr14	124902244
15 | chr15	104043685
16 | chr16	98207768
17 | chr17	94987271
18 | chr18	90702639
19 | chr19	61431566
20 | chrX	171031299
21 | chrY	91744698
22 | chrM	16299
23 | chr1_GL456210_random	169725
24 | chr1_GL456211_random	241735
25 | chr1_GL456212_random	153618
26 | chr1_GL456213_random	39340
27 | chr1_GL456221_random	206961
28 | chr4_GL456216_random	66673
29 | chr4_GL456350_random	227966
30 | chr4_JH584292_random	14945
31 | chr4_JH584293_random	207968
32 | chr4_JH584294_random	191905
33 | chr4_JH584295_random	1976
34 | chr5_GL456354_random	195993
35 | chr5_JH584296_random	199368
36 | chr5_JH584297_random	205776
37 | chr5_JH584298_random	184189
38 | chr5_JH584299_random	953012
39 | chr7_GL456219_random	175968
40 | chrUn_GL456239	40056
41 | chrUn_GL456359	22974
42 | chrUn_GL456360	31704
43 | chrUn_GL456366	47073
44 | chrUn_GL456367	42057
45 | chrUn_GL456368	20208
46 | chrUn_GL456370	26764
47 | chrUn_GL456372	28664
48 | chrUn_GL456378	31602
49 | chrUn_GL456379	72385
50 | chrUn_GL456381	25871
51 | chrUn_GL456382	23158
52 | chrUn_GL456383	38659
53 | chrUn_GL456385	35240
54 | chrUn_GL456387	24685
55 | chrUn_GL456389	28772
56 | chrUn_GL456390	24668
57 | chrUn_GL456392	23629
58 | chrUn_GL456393	55711
59 | chrUn_GL456394	24323
60 | chrUn_GL456396	21240
61 | chrUn_JH584304	114452
62 | chrX_GL456233_random	336933
63 | chrY_JH584300_random	182347
64 | chrY_JH584301_random	259875
65 | chrY_JH584302_random	155838
66 | chrY_JH584303_random	158099


--------------------------------------------------------------------------------
/data/hg19.chrom.sizes:
--------------------------------------------------------------------------------
 1 | chr1	249250621
 2 | chr2	243199373
 3 | chr3	198022430
 4 | chr4	191154276
 5 | chr5	180915260
 6 | chr6	171115067
 7 | chr7	159138663
 8 | chrX	155270560
 9 | chr8	146364022
10 | chr9	141213431
11 | chr10	135534747
12 | chr11	135006516
13 | chr12	133851895
14 | chr13	115169878
15 | chr14	107349540
16 | chr15	102531392
17 | chr16	90354753
18 | chr17	81195210
19 | chr18	78077248
20 | chr20	63025520
21 | chrY	59373566
22 | chr19	59128983
23 | chr22	51304566
24 | chr21	48129895
25 | chr6_ssto_hap7	4928567
26 | chr6_mcf_hap5	4833398
27 | chr6_cox_hap2	4795371
28 | chr6_mann_hap4	4683263
29 | chr6_apd_hap1	4622290
30 | chr6_qbl_hap6	4611984
31 | chr6_dbb_hap3	4610396
32 | chr17_ctg5_hap1	1680828
33 | chr4_ctg9_hap1	590426
34 | chr1_gl000192_random	547496
35 | chrUn_gl000225	211173
36 | chr4_gl000194_random	191469
37 | chr4_gl000193_random	189789
38 | chr9_gl000200_random	187035
39 | chrUn_gl000222	186861
40 | chrUn_gl000212	186858
41 | chr7_gl000195_random	182896
42 | chrUn_gl000223	180455
43 | chrUn_gl000224	179693
44 | chrUn_gl000219	179198
45 | chr17_gl000205_random	174588
46 | chrUn_gl000215	172545
47 | chrUn_gl000216	172294
48 | chrUn_gl000217	172149
49 | chr9_gl000199_random	169874
50 | chrUn_gl000211	166566
51 | chrUn_gl000213	164239
52 | chrUn_gl000220	161802
53 | chrUn_gl000218	161147
54 | chr19_gl000209_random	159169
55 | chrUn_gl000221	155397
56 | chrUn_gl000214	137718
57 | chrUn_gl000228	129120
58 | chrUn_gl000227	128374
59 | chr1_gl000191_random	106433
60 | chr19_gl000208_random	92689
61 | chr9_gl000198_random	90085
62 | chr17_gl000204_random	81310
63 | chrUn_gl000233	45941
64 | chrUn_gl000237	45867
65 | chrUn_gl000230	43691
66 | chrUn_gl000242	43523
67 | chrUn_gl000243	43341
68 | chrUn_gl000241	42152
69 | chrUn_gl000236	41934
70 | chrUn_gl000240	41933
71 | chr17_gl000206_random	41001
72 | chrUn_gl000232	40652
73 | chrUn_gl000234	40531
74 | chr11_gl000202_random	40103
75 | chrUn_gl000238	39939
76 | chrUn_gl000244	39929
77 | chrUn_gl000248	39786
78 | chr8_gl000196_random	38914
79 | chrUn_gl000249	38502
80 | chrUn_gl000246	38154
81 | chr17_gl000203_random	37498
82 | chr8_gl000197_random	37175
83 | chrUn_gl000245	36651
84 | chrUn_gl000247	36422
85 | chr9_gl000201_random	36148
86 | chrUn_gl000235	34474
87 | chrUn_gl000239	33824
88 | chr21_gl000210_random	27682
89 | chrUn_gl000231	27386
90 | chrUn_gl000229	19913
91 | chrM	16571
92 | chrUn_gl000226	15008
93 | chr18_gl000207_random	4262
94 | 


--------------------------------------------------------------------------------
/modules/postprocessing.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import torch
 3 | import numpy as np
 4 | import torch
 5 | # calibration
 6 | from tqdm.auto import trange
 7 | 
 8 | # modified function to return calibrated pairs/labels
def post_process_samples(sample_data,OEMs,nearest_neighbors,neighbor_contacts=True):
    """Attach calibrated, continuous labels to sampled bin pairs.

    For every cell, the sampled pairs are re-labelled with a value read from
    a per-neighbour rescaled copy of the cell's maps, and pairs whose
    rescaled value does not have the sign of the label being processed are
    dropped.

    Parameters
    ----------
    sample_data : per-cell collection; ``sample_data[n]`` is indexed as a 2-D
        numpy array whose first two columns are used as bin indices.
    OEMs : per-cell maps, indexed both by cell ``n`` and by
        ``nearest_neighbors[n]``.  Presumably a torch tensor (``.mean(dim=0)``
        and ``torch.sum`` are applied to slices of it) -- TODO confirm.
    nearest_neighbors : 2-D index structure; ``nearest_neighbors.shape[1]`` is
        used as the number of maps stacked per cell.
    neighbor_contacts : if True the label is the mean over all stacked maps,
        otherwise only the first stacked map (row 0) is used.

    Returns
    -------
    (all_continuous_pairs, all_continuous_labels) : two lists with one torch
    tensor per cell -- the ``long`` pairs (columns 0-1) and the float labels
    (column 2).
    """
    
    all_cell_chrom_samples = sample_data

    # labels are processed in two passes: positive first, then negative
    pos_negs = [1,-1]

    all_continuous_samples = []

    num_cells = len(OEMs)
    
    for n in trange(num_cells, desc='post processing'):
        
        continuous_intra_samples = []
        all_cell_labels = all_cell_chrom_samples[n]
        
        for pn in pos_negs:

            # NOTE(review): the comparison runs over the whole per-cell array,
            # not a dedicated label column, so rows whose index columns equal
            # `pn` are matched too (and a row can be matched more than once)
            # -- confirm this is intended.
            pos_neg_intra_idx = np.where((all_cell_labels == pn))[0]
            chrm_samples = all_cell_chrom_samples[n][pos_neg_intra_idx]

            # output rows are (bin_a, bin_b, calibrated value)
            adjusted_chrm_samples = np.zeros((len(chrm_samples),3))
            chrm_samples = chrm_samples[:,:2].astype(int)#.cpu().numpy().astype(int)

            chrlen = len(OEMs[n])

            # row-major linear index of each (bin_a, bin_b) pair into a flattened map
            chrm_linear_samples = chrm_samples[:,0] * chrlen + chrm_samples[:,1]
            
            # one flattened map per neighbour listed in nearest_neighbors[n]
            ms_flattened = OEMs[nearest_neighbors[n]].reshape(nearest_neighbors.shape[1],-1)
            for i in range(ms_flattened.shape[0]):
                # positive entries: divide by their own 97.5% quantile
                ms_flattened[i][ms_flattened[i] > 0] = ms_flattened[i][ms_flattened[i] > 0] / np.quantile(ms_flattened[i][ms_flattened[i] > 0],0.975)

                # non-positive entries (if any): divide by their own 2.5%
                # quantile and negate the ratio
                if torch.sum(ms_flattened[i] <= 0) > 0:
                    ms_flattened[i][ms_flattened[i] <= 0] = -(ms_flattened[i][ms_flattened[i] <= 0] / np.quantile(ms_flattened[i][ms_flattened[i] <= 0],0.025))
                
                # clip the rescaled values to [-1, 1]
                ms_flattened[i][ms_flattened[i] > 1] = 1
                ms_flattened[i][ms_flattened[i] < -1] = -1
                
            # calibrated value per pair: mean across the stacked maps, or the first map only
            c_flattened = ms_flattened[:,chrm_linear_samples].mean(dim=0) if neighbor_contacts else ms_flattened[0,chrm_linear_samples]
            adjusted_chrm_samples[:,:2] = chrm_samples
            adjusted_chrm_samples[:,2] = c_flattened

            # discard pairs whose calibrated value does not carry the sign of `pn`
            adjusted_del_idx = np.where(np.sign(c_flattened) != pn)[0]
            adjusted_chrm_samples = np.delete(adjusted_chrm_samples,adjusted_del_idx,axis=0)
            
            continuous_intra_samples.append(adjusted_chrm_samples)

        # positive and negative rows for this cell, concatenated into one float tensor
        continuous_intra_samples = torch.tensor(np.concatenate(continuous_intra_samples)).float()
        all_continuous_samples.append(continuous_intra_samples)

    # split each per-cell tensor into integer pairs and continuous labels
    all_continuous_pairs = [all_continuous_samples[i][:,:2].long() for i in range(len(all_continuous_samples))]
    all_continuous_labels = [all_continuous_samples[i][:,2] for i in range(len(all_continuous_samples))]
    
    return all_continuous_pairs, all_continuous_labels


--------------------------------------------------------------------------------
/utilities/helper.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | 
 4 | def to_cuda(x):
 5 |     
 6 |     if torch.cuda.is_available():
 7 |         return x.cuda()
 8 |     
 9 |     return x
10 | 
def get_expected(M,eps=1e-8):
    """Build the distance-stratified expected matrix for a square matrix ``M``.

    Every diagonal at offset ``i >= 0`` is filled with the mean of the
    entries of ``M`` on that diagonal; the upper triangle is then mirrored by
    adding the transpose, NaNs are zeroed, and ``eps`` is added everywhere.

    Because the main diagonal is written once and then added to itself by the
    symmetrisation step, it ends up holding twice its mean (kept as-is for
    backward compatibility).

    Parameters
    ----------
    M : 2-D square numpy array.
    eps : constant added to every entry of the result.

    Returns
    -------
    numpy array with the same shape as ``M``.
    """
    n = len(M)

    # Accumulate in a floating dtype: inheriting an integer dtype from M
    # (as np.zeros_like would) silently truncates the per-diagonal means.
    # Inexact (float/complex) inputs keep their own dtype.
    dtype = M.dtype if np.issubdtype(M.dtype, np.inexact) else np.float64
    E = np.zeros(M.shape, dtype=dtype)

    idx = np.arange(M.shape[0])

    for i in range(M.shape[0]):
        contacts = np.diag(M,i)
        expected = contacts.sum() / (n-i)
        # coordinates of the i-th upper diagonal
        rows = idx[:M.shape[0]-i]
        E[rows,rows+i] = expected

    # mirror the upper triangle into the lower one (doubles the main diagonal)
    E += E.T
    E = np.nan_to_num(E) + eps

    return E
27 |     
def get_oe_matrix(M):
    """Return the observed/expected matrix of ``M``.

    ``M`` is divided element-wise by ``get_expected(M)``, non-finite results
    are replaced via ``np.nan_to_num``, and the main diagonal is set to 1.
    """
    ratio = M / get_expected(M)
    oe = np.nan_to_num(ratio)

    # the diagonal is forced to 1 regardless of the computed ratio
    np.fill_diagonal(oe,1)

    return oe
34 | 
def random_sample(p, size, neg=False, normed=False):
    """Pick ``size`` entries along the last axis using noise-perturbed scores.

    A uniform random tensor of the same shape as ``p`` is drawn and the
    (optionally normalised) ``p`` is subtracted from it.  The ``size`` entries
    with the smallest resulting score are selected when ``neg`` is False, the
    largest when ``neg`` is True.

    Parameters
    ----------
    p : torch tensor of weights; normalised over the last axis unless
        ``normed`` is True.
    size : number of entries to select per row of the last axis.
    neg : passed to ``torch.topk`` as ``largest``.
    normed : set to True when ``p`` should be used without normalisation.

    Returns
    -------
    (scores, indices) : the selected noise-minus-weight scores and their
    positions along the last axis.
    """
    probs = p if normed else p / torch.sum(p, dim=-1, keepdim=True)

    noise = torch.rand(probs.shape,device=p.device)
    scores = noise - probs

    picked_scores,picked_idx = torch.topk(scores, size, dim=-1, largest=neg)

    # keep the leading entries normally, the trailing ones when neg is set
    window = slice(-size, None) if neg else slice(None, size)

    return picked_scores[..., window],picked_idx[..., window]
55 | 
def random_sample_sorted(p, size, top=None, neg=False, normed=False):
    """Variant of ``random_sample`` that can first truncate the last axis.

    When ``top`` is given, only the last ``top`` entries of ``p`` are kept
    (``neg`` False) or the first ``top`` entries (``neg`` True); the returned
    indices are positions within that truncated slice.  Selection then works
    on uniform noise minus the (optionally normalised) weights, taking the
    ``size`` smallest scores, or the largest when ``neg`` is True.

    Returns
    -------
    (scores, indices) : selected scores and their positions along the last
    axis of the (possibly truncated) input.
    """
    if top is not None:
        p = p[...,:top] if neg else p[...,-top:]

    probs = p if normed else p / torch.sum(p, dim=-1, keepdim=True)

    scores = torch.rand(probs.shape,device=p.device) - probs

    chosen_scores,chosen_idx = torch.topk(scores, size, dim=-1, largest=neg)

    return chosen_scores[..., :size],chosen_idx[..., :size]
80 | 
def random_sample_np(p, size, normed=False):
    """NumPy counterpart of ``random_sample``: pick ``size`` indices per row.

    Uniform noise (normalised to sum to 1 over the last axis) has the
    (optionally normalised) weights subtracted from it, and the ``size``
    positions with the smallest resulting score along the last axis are
    returned, in no particular order.

    Parameters
    ----------
    p : numpy array of weights; normalised over the last axis unless
        ``normed`` is True.
    size : number of indices to return along the last axis.
    normed : set to True when ``p`` should be used without normalisation.

    Returns
    -------
    Integer numpy array of selected positions along the last axis.
    """
    p_ = p if normed else p / np.sum(p, axis=-1, keepdims=True)

    rg = np.random.default_rng()
    random_num = rg.random(p_.shape)
    # Use the last axis throughout (the weights are normalised on axis=-1 and
    # the result is sliced on the last axis), so 1-D and N-D inputs behave
    # the same way as 2-D ones.
    random_num /= np.sum(random_num, axis=-1, keepdims=True)
    diff = random_num - p_

    # argpartition requires kth < number of candidates; clamp so that asking
    # for every candidate (size == diff.shape[-1]) works instead of raising.
    kth = min(size, diff.shape[-1] - 1)
    sampled_idx = np.argpartition(diff, kth, axis=-1)[..., :size]
    return sampled_idx


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
105 | __pypackages__/
106 | 
107 | # Celery stuff
108 | celerybeat-schedule
109 | celerybeat.pid
110 | 
111 | # SageMath parsed files
112 | *.sage.py
113 | 
114 | # Environments
115 | .env
116 | .venv
117 | env/
118 | venv/
119 | ENV/
120 | env.bak/
121 | venv.bak/
122 | 
123 | # Spyder project settings
124 | .spyderproject
125 | .spyproject
126 | 
127 | # Rope project settings
128 | .ropeproject
129 | 
130 | # mkdocs documentation
131 | /site
132 | 
133 | # mypy
134 | .mypy_cache/
135 | .dmypy.json
136 | dmypy.json
137 | 
138 | # Pyre type checker
139 | .pyre/
140 | 
141 | # pytype static type analyzer
142 | .pytype/
143 | 
144 | # Cython debug symbols
145 | cython_debug/
146 | 
147 | # PyCharm
148 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
149 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
150 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
151 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
152 | #.idea/
153 | .DS_Store
154 | .idea/inspectionProfiles/profiles_settings.xml
155 | .idea/inspectionProfiles/Project_Default.xml
156 | .idea/modules.xml
157 | .idea/scghost-dev.iml
158 | .idea/vcs.xml
159 | 
160 | *.hdf5
161 | *.npy
162 | *.pdf
163 | tutorial.json
164 | bed_files/


--------------------------------------------------------------------------------
/modules/random_walk.py:
--------------------------------------------------------------------------------
  1 | # parallelized constrained random walks on the GPU with random walks to inter corr matrix
  2 | import os
  3 | import torch
  4 | import pickle
  5 | import gc
  6 | import numpy as np
  7 | 
  8 | from utilities.gpu import to_cuda
  9 | from utilities.parsers import parse_nearest_neighbors
 10 | from utilities.helper import random_sample
 11 | from modules.preprocessing import parse_chromosomes
 12 | from torch.nn import functional as F
 13 | from tqdm import trange
 14 | 
def sample_chrom(chrom_num,OEMs,cell_range,nearest_neighbors,num_walks=50,use_breakpoint=False):
    """Sample positive and negative two-step walks for a range of cells.

    For each cell in ``cell_range`` the maps selected by
    ``OEMs[nearest_neighbors[cell_range]]`` are sorted row-wise, and
    ``num_walks`` two-step walks are drawn from every starting index:
    a "positive" walk restricted to the highest-valued quarter of each row and
    a "negative" walk whose first step is restricted to the lowest-valued
    quarter.  The result is a set of unique ``(source, target, label)`` rows
    with label ``1`` (positive) or ``-1`` (negative).

    Parameters
    ----------
    chrom_num : not used inside this function.
    OEMs : per-cell maps; presumably a torch tensor indexed by cell
        (``.float()`` is called on its slices) -- TODO confirm.
    cell_range : indices of the cells to process.
    nearest_neighbors : index structure giving, for each cell, the maps to
        stack for that cell.
    num_walks : number of independent walks drawn per starting index.
    use_breakpoint : if True, ``m`` is cropped to ``m[:bpt, bpt:]`` with
        ``bpt = len(m) // 2`` and ``bpt`` is added to the first-step targets.

    Returns
    -------
    list with one ``int16`` numpy array of unique ``(source, target, label)``
    rows per cell.
    """
    
    all_cell_chrom_samples = []
    layered_maps = OEMs[nearest_neighbors[cell_range]]

    for cnum in trange(len(cell_range)):
        # always 0 here; added to the source column below
        chrm_offset = 0

        # m = to_cuda(torch.tensor(torch.nan_to_num(layered_maps[cnum])).float())
        m = to_cuda(layered_maps[cnum].float()) # cast from bfloat16 to float32 for precision with sorting

        bpt = 0
        
        if use_breakpoint:
            # NOTE(review): this crops the first two axes of m; m[i] is sorted
            # with dim=1 below, so m is 3-D and the first axis is the stacked-map
            # axis -- confirm this cropping is what is intended.
            bpt = len(m) // 2
            m = m[:bpt,bpt:]
        
        all_samples = []

        # candidates per step: a quarter of the row length
        num_top = int(m.shape[1] * 0.25)

        # buffers for sorted values (w) and their original positions (i), for
        # each map and for its transpose; zeros_like(m) means the index buffers
        # share m's dtype and are converted with .long() after selection
        sorted_slc_w = torch.zeros_like(m)
        sorted_slc_i = torch.zeros_like(m)
        sorted_slc_w_T = torch.zeros_like(m)
        sorted_slc_i_T = torch.zeros_like(m)
        
        for i in range(m.shape[0]):
            # ascending sort of every row of the map and of its transpose
            sorted_slc_w[i],sorted_slc_i[i] = m[i].sort(dim=1)
            sorted_slc_w_T[i],sorted_slc_i_T[i] = m[i].T.sort(dim=1)

        # replicate along a new leading axis so all walks are drawn in one pass
        sorted_slc_w = sorted_slc_w.repeat(num_walks,1,1,1)
        sorted_slc_w_T = sorted_slc_w_T.repeat(num_walks,1,1,1)
        sorted_slc_i = sorted_slc_i.repeat(num_walks,1,1,1)
        sorted_slc_i_T = sorted_slc_i_T.repeat(num_walks,1,1,1)

        # every index along axis 1 is used as a walk starting point
        test_samples = to_cuda(torch.arange(m.shape[1]))

        w1,i1 = sorted_slc_w[:,:,test_samples],sorted_slc_i[:,:,test_samples]

        # first step: weights for the top (positive) and bottom (negative) quarter
        pw1 = torch.exp(w1[...,-num_top:])
        nw1 = 1/torch.exp(w1[...,:num_top]) # inverse to select for lower contact frequencies

        pi1 = i1[...,-num_top:]
        ni1 = i1[...,:num_top]

        # draw one candidate per start and turn it into a one-hot selector
        p_mask = F.one_hot(torch.squeeze(random_sample(pw1,1)[1]),num_classes=pi1.shape[-1])
        n_mask = F.one_hot(torch.squeeze(random_sample(nw1,1)[1]),num_classes=ni1.shape[-1])

        # original position of the chosen first-step target
        pos_selection1 = ((pi1 * p_mask).sum(dim=-1)).long()
        neg_selection1 = ((ni1 * n_mask).sum(dim=-1)).long()

        # second step (positive walk): sorted rows of the transposed maps at
        # the first-step targets
        pw2 = torch.gather(sorted_slc_w_T,-2,pos_selection1[...,None].tile(1,1,1,pos_selection1.shape[-1]))
        pi2 = torch.gather(sorted_slc_i_T,-2,pos_selection1[...,None].tile(1,1,1,pos_selection1.shape[-1]))

        pw2 = torch.exp(pw2[...,-num_top:])
        pi2 = pi2[...,-num_top:]

        # second step (negative walk): same gather from the negative first-step
        # targets; note it also draws from the top quarter (-num_top:)
        nw2 = torch.gather(sorted_slc_w_T,-2,neg_selection1[...,None].tile(1,1,1,pos_selection1.shape[-1]))
        ni2 = torch.gather(sorted_slc_i_T,-2,neg_selection1[...,None].tile(1,1,1,pos_selection1.shape[-1]))

        nw2 = torch.exp(nw2[...,-num_top:])
        ni2 = ni2[...,-num_top:]

        p_mask = F.one_hot(torch.squeeze(random_sample(pw2,1)[1]),num_classes=pi2.shape[-1])
        n_mask = F.one_hot(torch.squeeze(random_sample(nw2,1)[1]),num_classes=ni2.shape[-1])

        pos_selection2 = ((pi2 * p_mask).sum(dim=-1)).long()
        neg_selection2 = ((ni2 * n_mask).sum(dim=-1)).long()

        for i in range(num_walks):
            # interleave the four targets of walk i: pos step 1, pos step 2,
            # neg step 1, neg step 2 (first-step targets shifted by bpt)
            selections = to_cuda(torch.stack((
                pos_selection1[i].flatten() + bpt,
                pos_selection2[i].flatten(),
                neg_selection1[i].flatten() + bpt,
                neg_selection2[i].flatten()
            )).T.flatten())
            
            # labels follow the same pos, pos, neg, neg interleaving
            labels = to_cuda(torch.tensor([1,1,-1,-1]).repeat(len(m) * len(test_samples)))

            # NOTE(review): the source column repeats each start index
            # 4 * len(m) times consecutively, whereas `selections` comes from
            # flattening the per-walk selection tensors -- verify the two
            # orderings line up when len(m) > 1.
            interactions = torch.stack((
                test_samples.repeat_interleave(4 * len(m)) + chrm_offset,
                selections,
                labels,
            )).T

            all_samples.append(interactions)

        # deduplicate rows across walks.
        # NOTE(review): the int16 cast cannot represent indices above 32767 --
        # confirm map sizes stay below that.
        all_samples = torch.unique(torch.cat(all_samples),dim=0).cpu().numpy().astype(np.int16)
        all_cell_chrom_samples.append(all_samples)

    return all_cell_chrom_samples


--------------------------------------------------------------------------------
/sample_configs/config_wtc.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "schic_directory" : "/mnt/e/data/wtc",
  3 |     "label_info": null,
  4 |     "data_directory" : "/mnt/e/data/scghost_wtc_output",
  5 |     "chromosomes" : {
  6 |         "1" : {
  7 |             "adj" : "chr1_sparse_adj.npy",
  8 |             "imputed" : "chr1_exp1_nbr_5_impute.hdf5",
  9 |             "integer" : 1
 10 |         },
 11 |         "2" : {
 12 |             "adj" : "chr2_sparse_adj.npy",
 13 |             "imputed" : "chr2_exp1_nbr_5_impute.hdf5",
 14 |             "integer" : 2
 15 |         },
 16 |         "3" : {
 17 |             "adj" : "chr3_sparse_adj.npy",
 18 |             "imputed" : "chr3_exp1_nbr_5_impute.hdf5",
 19 |             "integer" : 3
 20 |         },
 21 |         "4" : {
 22 |             "adj" : "chr4_sparse_adj.npy",
 23 |             "imputed" : "chr4_exp1_nbr_5_impute.hdf5",
 24 |             "integer" : 4
 25 |         },
 26 |         "5" : {
 27 |             "adj" : "chr5_sparse_adj.npy",
 28 |             "imputed" : "chr5_exp1_nbr_5_impute.hdf5",
 29 |             "integer" : 5
 30 |         },
 31 |         "6" : {
 32 |             "adj" : "chr6_sparse_adj.npy",
 33 |             "imputed" : "chr6_exp1_nbr_5_impute.hdf5",
 34 |             "integer" : 6
 35 |         },
 36 |         "7" : {
 37 |             "adj" : "chr7_sparse_adj.npy",
 38 |             "imputed" : "chr7_exp1_nbr_5_impute.hdf5",
 39 |             "integer" : 7
 40 |         },
 41 |         "8" : {
 42 |             "adj" : "chr8_sparse_adj.npy",
 43 |             "imputed" : "chr8_exp1_nbr_5_impute.hdf5",
 44 |             "integer" : 8
 45 |         },
 46 |         "9" : {
 47 |             "adj" : "chr9_sparse_adj.npy",
 48 |             "imputed" : "chr9_exp1_nbr_5_impute.hdf5",
 49 |             "integer" : 9
 50 |         },
 51 |         "10" : {
 52 |             "adj" : "chr10_sparse_adj.npy",
 53 |             "imputed" : "chr10_exp1_nbr_5_impute.hdf5",
 54 |             "integer" : 10
 55 |         },
 56 |         "11" : {
 57 |             "adj" : "chr11_sparse_adj.npy",
 58 |             "imputed" : "chr11_exp1_nbr_5_impute.hdf5",
 59 |             "integer" : 11
 60 |         },
 61 |         "12" : {
 62 |             "adj" : "chr12_sparse_adj.npy",
 63 |             "imputed" : "chr12_exp1_nbr_5_impute.hdf5",
 64 |             "integer" : 12
 65 |         },
 66 |         "13" : {
 67 |             "adj" : "chr13_sparse_adj.npy",
 68 |             "imputed" : "chr13_exp1_nbr_5_impute.hdf5",
 69 |             "integer" : 13
 70 |         },
 71 |         "14" : {
 72 |             "adj" : "chr14_sparse_adj.npy",
 73 |             "imputed" : "chr14_exp1_nbr_5_impute.hdf5",
 74 |             "integer" : 14
 75 |         },
 76 |         "15" : {
 77 |             "adj" : "chr15_sparse_adj.npy",
 78 |             "imputed" : "chr15_exp1_nbr_5_impute.hdf5",
 79 |             "integer" : 15
 80 |         },
 81 |         "16" : {
 82 |             "adj" : "chr16_sparse_adj.npy",
 83 |             "imputed" : "chr16_exp1_nbr_5_impute.hdf5",
 84 |             "integer" : 16
 85 |         },
 86 |         "17" : {
 87 |             "adj" : "chr17_sparse_adj.npy",
 88 |             "imputed" : "chr17_exp1_nbr_5_impute.hdf5",
 89 |             "integer" : 17
 90 |         },
 91 |         "18" : {
 92 |             "adj" : "chr18_sparse_adj.npy",
 93 |             "imputed" : "chr18_exp1_nbr_5_impute.hdf5",
 94 |             "integer" : 18
 95 |         },
 96 |         "19" : {
 97 |             "adj" : "chr19_sparse_adj.npy",
 98 |             "imputed" : "chr19_exp1_nbr_5_impute.hdf5",
 99 |             "integer" : 19
100 |         },
101 |         "20" : {
102 |             "adj" : "chr20_sparse_adj.npy",
103 |             "imputed" : "chr20_exp1_nbr_5_impute.hdf5",
104 |             "integer" : 20
105 |         },
106 |         "21" : {
107 |             "adj" : "chr21_sparse_adj.npy",
108 |             "imputed" : "chr21_exp1_nbr_5_impute.hdf5",
109 |             "integer" : 21
110 |         },
111 |         "22" : {
112 |             "adj" : "chr22_sparse_adj.npy",
113 |             "imputed" : "chr22_exp1_nbr_5_impute.hdf5",
114 |             "integer" : 22
115 |         }
116 |     },
117 |     "chrom_sizes" : "data/hg38.chrom.sizes",
118 |     "chrom_indices" : null,
119 |     "embeddings_path" : "/mnt/e/data/wtc/embed/exp1_0_origin.npy",
120 |     "higashi_scab_path" : "/mnt/e/data/wtc/scAB_with_nbr.hdf5",
121 |     "cell_type" : null,
122 |     "random_walk" : {
123 |         "num_walks" : 25,
124 |         "ignore_top" : 0.02,
125 |         "top_percentile" : 0.25
126 |     },
127 |     "eps": 1e-8,
128 |     "num_clusters" : 5,
129 |     "batch_size" : 16,
130 |     "epochs" : 5,
131 |     "resolution" : 500000,
132 |     "neighbor_contacts" : true,
133 |     "nearest_neighbor_override" : null,
134 |     "gpu_uniques" : true,
135 |     "cluster_gpu_caching" : true,
136 |     "kmeans_init" : 1
137 | }


--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "schic_directory" : "/directory/of/higashi/imputed/maps",
  3 |     "label_info": {
  4 |         "path":"/path/to/label_info.pickle",
  5 |         "cell_type_key": "cell type key in label_info.pickle"
  6 |     },
  7 |     "data_directory" : "/directory/to/save/scghost/outputs/",
  8 |     "chromosomes" : {
  9 |         "1" : {
 10 |             "adj" : "chr1_sparse_adj.npy",
 11 |             "imputed" : "chr1_exp1_nbr_5_impute.hdf5",
 12 |             "integer" : 1
 13 |         },
 14 |         "2" : {
 15 |             "adj" : "chr2_sparse_adj.npy",
 16 |             "imputed" : "chr2_exp1_nbr_5_impute.hdf5",
 17 |             "integer" : 2
 18 |         },
 19 |         "3" : {
 20 |             "adj" : "chr3_sparse_adj.npy",
 21 |             "imputed" : "chr3_exp1_nbr_5_impute.hdf5",
 22 |             "integer" : 3
 23 |         },
 24 |         "4" : {
 25 |             "adj" : "chr4_sparse_adj.npy",
 26 |             "imputed" : "chr4_exp1_nbr_5_impute.hdf5",
 27 |             "integer" : 4
 28 |         },
 29 |         "5" : {
 30 |             "adj" : "chr5_sparse_adj.npy",
 31 |             "imputed" : "chr5_exp1_nbr_5_impute.hdf5",
 32 |             "integer" : 5
 33 |         },
 34 |         "6" : {
 35 |             "adj" : "chr6_sparse_adj.npy",
 36 |             "imputed" : "chr6_exp1_nbr_5_impute.hdf5",
 37 |             "integer" : 6
 38 |         },
 39 |         "7" : {
 40 |             "adj" : "chr7_sparse_adj.npy",
 41 |             "imputed" : "chr7_exp1_nbr_5_impute.hdf5",
 42 |             "integer" : 7
 43 |         },
 44 |         "8" : {
 45 |             "adj" : "chr8_sparse_adj.npy",
 46 |             "imputed" : "chr8_exp1_nbr_5_impute.hdf5",
 47 |             "integer" : 8
 48 |         },
 49 |         "9" : {
 50 |             "adj" : "chr9_sparse_adj.npy",
 51 |             "imputed" : "chr9_exp1_nbr_5_impute.hdf5",
 52 |             "integer" : 9
 53 |         },
 54 |         "10" : {
 55 |             "adj" : "chr10_sparse_adj.npy",
 56 |             "imputed" : "chr10_exp1_nbr_5_impute.hdf5",
 57 |             "integer" : 10
 58 |         },
 59 |         "11" : {
 60 |             "adj" : "chr11_sparse_adj.npy",
 61 |             "imputed" : "chr11_exp1_nbr_5_impute.hdf5",
 62 |             "integer" : 11
 63 |         },
 64 |         "12" : {
 65 |             "adj" : "chr12_sparse_adj.npy",
 66 |             "imputed" : "chr12_exp1_nbr_5_impute.hdf5",
 67 |             "integer" : 12
 68 |         },
 69 |         "13" : {
 70 |             "adj" : "chr13_sparse_adj.npy",
 71 |             "imputed" : "chr13_exp1_nbr_5_impute.hdf5",
 72 |             "integer" : 13
 73 |         },
 74 |         "14" : {
 75 |             "adj" : "chr14_sparse_adj.npy",
 76 |             "imputed" : "chr14_exp1_nbr_5_impute.hdf5",
 77 |             "integer" : 14
 78 |         },
 79 |         "15" : {
 80 |             "adj" : "chr15_sparse_adj.npy",
 81 |             "imputed" : "chr15_exp1_nbr_5_impute.hdf5",
 82 |             "integer" : 15
 83 |         },
 84 |         "16" : {
 85 |             "adj" : "chr16_sparse_adj.npy",
 86 |             "imputed" : "chr16_exp1_nbr_5_impute.hdf5",
 87 |             "integer" : 16
 88 |         },
 89 |         "17" : {
 90 |             "adj" : "chr17_sparse_adj.npy",
 91 |             "imputed" : "chr17_exp1_nbr_5_impute.hdf5",
 92 |             "integer" : 17
 93 |         },
 94 |         "18" : {
 95 |             "adj" : "chr18_sparse_adj.npy",
 96 |             "imputed" : "chr18_exp1_nbr_5_impute.hdf5",
 97 |             "integer" : 18
 98 |         },
 99 |         "19" : {
100 |             "adj" : "chr19_sparse_adj.npy",
101 |             "imputed" : "chr19_exp1_nbr_5_impute.hdf5",
102 |             "integer" : 19
103 |         },
104 |         "20" : {
105 |             "adj" : "chr20_sparse_adj.npy",
106 |             "imputed" : "chr20_exp1_nbr_5_impute.hdf5",
107 |             "integer" : 20
108 |         },
109 |         "21" : {
110 |             "adj" : "chr21_sparse_adj.npy",
111 |             "imputed" : "chr21_exp1_nbr_5_impute.hdf5",
112 |             "integer" : 21
113 |         },
114 |         "22" : {
115 |             "adj" : "chr22_sparse_adj.npy",
116 |             "imputed" : "chr22_exp1_nbr_5_impute.hdf5",
117 |             "integer" : 22
118 |         }
119 |     },
120 |     "chrom_sizes" : "data/hg38.chrom.sizes",
121 |     "chrom_indices" : null,
122 |     "embeddings_path" : "/path/to/exp1_0_origin.npy",
123 |     "higashi_scab_path" : "/path/to/higashi/scAB.hdf5",
124 |     "cell_type" : null,
125 |     "random_walk" : {
126 |         "num_walks" : 50,
127 |         "ignore_top" : 0.02,
128 |         "top_percentile" : 0.25
129 |     },
130 |     "eps": 1e-8,
131 |     "num_clusters" : 5,
132 |     "batch_size" : 16,
133 |     "epochs" : 5,
134 |     "resolution" : 500000,
135 |     "neighbor_contacts" : false,
136 |     "nearest_neighbor_override" : null,
137 |     "gpu_uniques" : true,
138 |     "cluster_gpu_caching" : true,
139 |     "kmeans_init" : 1
140 | }


--------------------------------------------------------------------------------
/modules/preprocessing.py:
--------------------------------------------------------------------------------
  1 | from cProfile import run
  2 | import numpy as np
  3 | import pickle
  4 | import gc
  5 | import os
  6 | import h5py
  7 | 
  8 | from utilities.parsers import parse_chromosomes, parse_cell_types
  9 | from utilities.chrom_sizes import chrom_sizes
 10 | from scipy.sparse import coo_matrix
 11 | from tqdm import trange
 12 | 
 13 | def compute_chrom_indices(runtime_args,save=True):
 14 | 
 15 |     if runtime_args['chrom_indices'] is not None:
 16 |         return
 17 | 
 18 |     chromosomes = parse_chromosomes(runtime_args)
 19 |     cell_type_index = parse_cell_types(runtime_args)
 20 | 
 21 |     chrom_indices = {}
 22 | 
 23 |     for n in trange(len(chromosomes)):
 24 | 
 25 |         chrom = chromosomes[n]
 26 |         adj_path = runtime_args['chromosomes'][chrom]['adj']
 27 | 
 28 |         sparse_M = np.load(os.path.join(runtime_args['schic_directory'],adj_path),allow_pickle=True)
 29 |         sparse_M = sparse_M[cell_type_index] if cell_type_index is not None else sparse_M
 30 | 
 31 |         M = sparse_M.sum(axis=0).toarray()
 32 | 
 33 |         nongap = np.where(np.sum(M > 0, axis=-1, keepdims=False) >= (0.1 * M.shape[0]))[0]
 34 |         
 35 |         chrom_indices[chrom] = nongap
 36 |         
 37 |     gc.collect()
 38 | 
 39 |     data_dir = runtime_args['data_directory']
 40 | 
 41 |     if save:
 42 |         pickle.dump(chrom_indices,open(
 43 |                 os.path.join(data_dir,'chrom_indices.pkl'),'wb'
 44 |         ))
 45 | 
 46 |     return chrom_indices
 47 | 
 48 | def extract_OEMs(fname,cell_type_index,chrom_indices,num_cells,chrom_num,chrom_start_end,save_path=None,eps=1e-8):
 49 |     f = h5py.File(fname)
 50 |     
 51 |     chrom_size = chrom_start_end[chrom_num-1,1] - chrom_start_end[chrom_num-1,0]
 52 |     coords = np.array(f['coordinates'])
 53 | 
 54 |     if cell_type_index is None:
 55 |         cti = []
 56 |         for i in range(len(f)):
 57 |             if 'cell_%d' % i in f:
 58 |                 cti.append(i)
 59 |         
 60 |         cell_type_index = np.array(cti)
 61 | 
 62 |     num_cells = len(cell_type_index) if num_cells is None else np.min([num_cells,len(cell_type_index)])
 63 |     cells_data = np.array([np.array(f['cell_%d' % cell_type_index[i]]) for i in range(num_cells)])
 64 | 
 65 |     OEMs = []
 66 |     Ms = []
 67 | 
 68 |     for cell_num in trange(num_cells):
 69 |         M = coo_matrix((cells_data[cell_num],(coords[:,0],coords[:,1])),shape=(chrom_size,chrom_size)).toarray()
 70 |         M += M.T
 71 | 
 72 |         # construct expected matrix
 73 |         E = np.zeros_like(M)
 74 |         l = len(M)
 75 | 
 76 |         for i in range(M.shape[0]):
 77 |             contacts = np.diag(M,i)
 78 |             expected = contacts.sum() / (l-i)
 79 |             # expected = np.median(contacts)
 80 |             x_diag,y_diag = np.diag_indices(M.shape[0]-i)
 81 |             x,y = x_diag,y_diag+i
 82 |             E[x,y] = expected
 83 |             
 84 |         E += E.T
 85 |         E = np.nan_to_num(E) + eps
 86 | 
 87 |         OE = M / E
 88 |         OE = OE[chrom_indices][:,chrom_indices]
 89 |         OE[OE == 0] = 1
 90 |         OE = np.log(OE)
 91 |         Ms.append(M[chrom_indices][:,chrom_indices])
 92 |         OEMs.append(OE)
 93 | 
 94 |     OEMs = np.array(OEMs)
 95 |     Ms = np.array(Ms)
 96 | 
 97 |     # print(OEMs.shape)
 98 |     if save_path is None:
 99 |         return OEMs#, Ms
100 |     else:
101 |         # np.savez_compressed(save_path,oe=OEMs,observed=Ms)
102 |         np.save(save_path,OEMs)
103 |         # np.save(save_path+'_observed',Ms)
104 | 
def compute_observed_over_expected(runtime_args):
    """Compute and save the log O/E matrices of every configured chromosome.

    Loads ``chrom_start_end.npy`` from ``runtime_args['schic_directory']`` and
    ``chrom_indices.pkl`` from ``runtime_args['data_directory']``, then calls
    ``extract_OEMs`` once per chromosome, writing the result to
    ``<data_directory>/<chrom>_oe``.  A chromosome whose config entry has a
    non-null ``matrix`` value is skipped; a missing ``matrix`` key is treated
    like ``null``.

    Args:
        runtime_args: parsed configuration dictionary.  Reads the keys
            ``schic_directory``, ``data_directory``, ``chromosomes`` (per
            chromosome ``imputed``, ``integer`` and optional ``matrix``) and
            ``eps``.

    Returns:
        None.  Results are written to disk and progress is printed.
    """

    schic_dir = runtime_args['schic_directory']
    data_dir = runtime_args['data_directory']

    chrom_start_end = np.load(os.path.join(schic_dir,'chrom_start_end.npy'))

    # NOTE: unpickling runs arbitrary code -- only point this at a trusted file.
    with open(os.path.join(data_dir,'chrom_indices.pkl'),'rb') as handle:
        chrom_indices = pickle.load(handle)

    chromosomes = parse_chromosomes(runtime_args)
    # Depends only on runtime_args, so resolve it once instead of per chromosome.
    cell_type_index = parse_cell_types(runtime_args)

    for chrom_num in chromosomes:

        chrom_cfg = runtime_args['chromosomes'][chrom_num]

        # .get(): config entries without a 'matrix' key must not raise KeyError.
        if chrom_cfg.get('matrix') is not None:
            continue

        extract_OEMs(
            os.path.join(schic_dir,chrom_cfg['imputed']),
            cell_type_index,
            chrom_indices[chrom_num],
            None,
            chrom_cfg['integer'],
            chrom_start_end,
            save_path=os.path.join(data_dir,'{0}_oe'.format(chrom_num)),
            eps=runtime_args['eps']
        )

        print('{0} complete'.format(chrom_num))
        gc.collect()


--------------------------------------------------------------------------------
/sample_configs/config_GM12878.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "schic_directory" : "/mnt/e/data/4dn_orig",
  3 |     "label_info": {
  4 |         "path":"/mnt/e/data/4dn_scihic/label_info_4DN_sciHiC.pickle",
  5 |         "cell_type_key": "cell type"
  6 |     },
  7 |     "data_directory" : "/mnt/e/data/scghost_GM12878_output",
  8 |     "chromosomes" : {
  9 |         "1" : {
 10 |             "adj" : "chr1_sparse_adj.npy",
 11 |             "imputed" : "chr1_exp1_nbr_5_impute.hdf5",
 12 |             "integer" : 1
 13 |         },
 14 |         "2" : {
 15 |             "adj" : "chr2_sparse_adj.npy",
 16 |             "imputed" : "chr2_exp1_nbr_5_impute.hdf5",
 17 |             "integer" : 2
 18 |         },
 19 |         "3" : {
 20 |             "adj" : "chr3_sparse_adj.npy",
 21 |             "imputed" : "chr3_exp1_nbr_5_impute.hdf5",
 22 |             "integer" : 3
 23 |         },
 24 |         "4" : {
 25 |             "adj" : "chr4_sparse_adj.npy",
 26 |             "imputed" : "chr4_exp1_nbr_5_impute.hdf5",
 27 |             "integer" : 4
 28 |         },
 29 |         "5" : {
 30 |             "adj" : "chr5_sparse_adj.npy",
 31 |             "imputed" : "chr5_exp1_nbr_5_impute.hdf5",
 32 |             "integer" : 5
 33 |         },
 34 |         "6" : {
 35 |             "adj" : "chr6_sparse_adj.npy",
 36 |             "imputed" : "chr6_exp1_nbr_5_impute.hdf5",
 37 |             "integer" : 6
 38 |         },
 39 |         "7" : {
 40 |             "adj" : "chr7_sparse_adj.npy",
 41 |             "imputed" : "chr7_exp1_nbr_5_impute.hdf5",
 42 |             "integer" : 7
 43 |         },
 44 |         "8" : {
 45 |             "adj" : "chr8_sparse_adj.npy",
 46 |             "imputed" : "chr8_exp1_nbr_5_impute.hdf5",
 47 |             "integer" : 8
 48 |         },
 49 |         "9" : {
 50 |             "adj" : "chr9_sparse_adj.npy",
 51 |             "imputed" : "chr9_exp1_nbr_5_impute.hdf5",
 52 |             "integer" : 9
 53 |         },
 54 |         "10" : {
 55 |             "adj" : "chr10_sparse_adj.npy",
 56 |             "imputed" : "chr10_exp1_nbr_5_impute.hdf5",
 57 |             "integer" : 10
 58 |         },
 59 |         "11" : {
 60 |             "adj" : "chr11_sparse_adj.npy",
 61 |             "imputed" : "chr11_exp1_nbr_5_impute.hdf5",
 62 |             "integer" : 11
 63 |         },
 64 |         "12" : {
 65 |             "adj" : "chr12_sparse_adj.npy",
 66 |             "imputed" : "chr12_exp1_nbr_5_impute.hdf5",
 67 |             "integer" : 12
 68 |         },
 69 |         "13" : {
 70 |             "adj" : "chr13_sparse_adj.npy",
 71 |             "imputed" : "chr13_exp1_nbr_5_impute.hdf5",
 72 |             "integer" : 13
 73 |         },
 74 |         "14" : {
 75 |             "adj" : "chr14_sparse_adj.npy",
 76 |             "imputed" : "chr14_exp1_nbr_5_impute.hdf5",
 77 |             "integer" : 14
 78 |         },
 79 |         "15" : {
 80 |             "adj" : "chr15_sparse_adj.npy",
 81 |             "imputed" : "chr15_exp1_nbr_5_impute.hdf5",
 82 |             "integer" : 15
 83 |         },
 84 |         "16" : {
 85 |             "adj" : "chr16_sparse_adj.npy",
 86 |             "imputed" : "chr16_exp1_nbr_5_impute.hdf5",
 87 |             "integer" : 16
 88 |         },
 89 |         "17" : {
 90 |             "adj" : "chr17_sparse_adj.npy",
 91 |             "imputed" : "chr17_exp1_nbr_5_impute.hdf5",
 92 |             "integer" : 17
 93 |         },
 94 |         "18" : {
 95 |             "adj" : "chr18_sparse_adj.npy",
 96 |             "imputed" : "chr18_exp1_nbr_5_impute.hdf5",
 97 |             "integer" : 18
 98 |         },
 99 |         "19" : {
100 |             "adj" : "chr19_sparse_adj.npy",
101 |             "imputed" : "chr19_exp1_nbr_5_impute.hdf5",
102 |             "integer" : 19
103 |         },
104 |         "20" : {
105 |             "adj" : "chr20_sparse_adj.npy",
106 |             "imputed" : "chr20_exp1_nbr_5_impute.hdf5",
107 |             "integer" : 20
108 |         },
109 |         "21" : {
110 |             "adj" : "chr21_sparse_adj.npy",
111 |             "imputed" : "chr21_exp1_nbr_5_impute.hdf5",
112 |             "integer" : 21
113 |         },
114 |         "22" : {
115 |             "adj" : "chr22_sparse_adj.npy",
116 |             "imputed" : "chr22_exp1_nbr_5_impute.hdf5",
117 |             "integer" : 22
118 |         }
119 |     },
120 |     "chrom_sizes" : "data/hg38.chrom.sizes",
121 |     "chrom_indices" : "/mnt/e/data/scghost_GM12878_output/chrom_indices.pkl",
122 |     "embeddings_path" : "/mnt/e/data/4dn_scihic/exp1_0_origin.npy",
123 |     "higashi_scab_path" : "/mnt/e/data/4dn_scihic/scAB.hdf5",
124 |     "cell_type" : "GM12878",
125 |     "random_walk" : {
126 |         "num_walks" : 25,
127 |         "ignore_top" : 0.02,
128 |         "top_percentile" : 0.25
129 |     },
130 |     "eps": 1e-8,
131 |     "num_clusters" : 5,
132 |     "batch_size" : 16,
133 |     "epochs" : 5,
134 |     "resolution" : 500000,
135 |     "neighbor_contacts" : false,
136 |     "nearest_neighbor_override" : null,
137 |     "gpu_uniques" : true,
138 |     "cluster_gpu_caching" : true,
139 |     "kmeans_init" : 1
140 | }


--------------------------------------------------------------------------------
/modules/embedding.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | import numpy as np
  4 | import pickle
  5 | import gc
  6 | 
  7 | from tqdm.auto import trange
  8 | from modules.preprocessing import parse_chromosomes
  9 | from torch import nn
 10 | from tqdm import trange
 11 | from torch.nn import functional as F
 12 | 
# Module-level constant. Nothing in this module reads it: the functions and
# the `hubs` class below take the number of cells as a parameter or attribute.
# NOTE(review): may be imported elsewhere -- verify before removing.
num_cells = 500
 14 | 
 15 | def to_cuda(x):
 16 |     if torch.cuda.is_available():
 17 |         return x.cuda()
 18 | 
 19 |     return x
 20 | 
 21 | 
 22 | class hubs(nn.Module):
 23 |     def __init__(self, N, num_cells, hidden_dim=128):
 24 |         super(hubs, self).__init__()
 25 |         self.N = N
 26 |         self.num_cells = num_cells
 27 |         self.hidden_dim = hidden_dim
 28 | 
 29 |         self.embedding = nn.Embedding(self.N * self.num_cells,
 30 |                                       self.hidden_dim, sparse=True, max_norm=1)
 31 | 
 32 |         to_cuda(self)
 33 | 
 34 |     def to_one_hot(self, x):
 35 |         return F.one_hot(x, num_classes=self.N * self.num_cells).float()
 36 | 
 37 |     def embed(self, x):
 38 |         return self.embedding(x)
 39 | 
 40 | 
 41 | def prep_pairs_labels(all_pairs, all_labels, gap, indices=None, thresh=None):
 42 |     concatenated_pairs = []
 43 |     concatenated_labels = []
 44 | 
 45 |     lengths = [len(xx) for xx in all_pairs]
 46 |     clip_len = np.min(lengths) if thresh is None else thresh
 47 |     iterable = range(len(all_pairs)) if indices is None else indices
 48 | 
 49 |     kept_cells = []
 50 |     n = 0
 51 | 
 52 |     # Random permutation within each cell
 53 |     for i in iterable:
 54 |         cell_pairs = all_pairs[i]
 55 |         
 56 |         if thresh is not None and len(cell_pairs) < thresh:
 57 |             continue
 58 | 
 59 |         id_ = torch.randperm(len(cell_pairs))[:clip_len]
 60 |         concatenated_pairs.append(cell_pairs[id_] + n * gap)
 61 |         concatenated_labels.append(all_labels[i][id_])
 62 | 
 63 |         n += 1
 64 |         kept_cells.append(i)
 65 |         
 66 |     # Stack instead of concat, shape of (#cell, #pairs, 2)
 67 |     concatenated_pairs = torch.stack(concatenated_pairs, dim=0)
 68 |     concatenated_labels = torch.stack(concatenated_labels, dim=0)
 69 | 
 70 |     return (concatenated_pairs, concatenated_labels) if thresh is None else (concatenated_pairs,concatenated_labels,np.array(kept_cells))
 71 | 
 72 | 
 73 | def embed_single_cells_unified(all_continuous_pairs, all_continuous_labels, OEMs, embedding_file, epochs=1,
 74 |                                cell_nums=None, batch_size=64, verbose=False, prepped=False):
 75 |     cell_nums = np.arange(len(all_continuous_pairs)) if cell_nums is None else cell_nums
 76 | 
 77 |     model = hubs(len(OEMs[0]), len(cell_nums), hidden_dim=128)
 78 |     bs = batch_size
 79 | 
 80 |     all_Es = []
 81 |     optimizer = torch.optim.SparseAdam(model.parameters())
 82 | 
 83 |     if not prepped:
 84 |         all_continuous_pairs, all_continuous_labels = prep_pairs_labels(all_continuous_pairs,
 85 |                                                                       all_continuous_labels,
 86 |                                                                       OEMs[0].shape[0],
 87 |                                                                       indices=cell_nums)
 88 |     for epoch in range(epochs):
 89 | 
 90 |         shuffle_id = torch.randperm(all_continuous_pairs.shape[1])
 91 | 
 92 |         N_pairs = all_continuous_pairs.shape[-2]
 93 |         rloss = 0
 94 |         rsamples = 0
 95 |         bar = trange(0, N_pairs, bs) if verbose else range(0, N_pairs, bs)
 96 | 
 97 |         for i in bar:
 98 |             # During training, sample a batch of pairs from each cell (can be small 16 yields good results)
 99 |             # You can also sample a batch of cells as well, but that needs to be sth large, like 2k cells etc.
100 |             x = model.embed(to_cuda(all_continuous_pairs[:, shuffle_id[i:i + bs], :]))
101 |             x1 = x[:, :, 0]
102 |             x2 = x[:, :, 1]
103 | 
104 |             y = to_cuda(all_continuous_labels[:, shuffle_id[i:i + bs]])
105 |             sim = F.cosine_similarity(x1, x2, dim=-1)
106 |             optimizer.zero_grad()
107 |             loss = F.mse_loss(sim, y)
108 |             loss.backward()
109 |             optimizer.step()
110 | 
111 |             blen = x.shape[1]
112 |             rloss += float(loss) * blen
113 |             rsamples += blen
114 | 
115 |         print('Epoch %d: %d/%d -- %.6f loss' % (epoch, rsamples, N_pairs, rloss / rsamples), end='\r')
116 | 
117 |         print()
118 | 
119 |     num_loci = model.N * model.num_cells
120 |     bar = trange(0, num_loci, bs) if verbose else range(0, num_loci, bs)
121 | 
122 |     for i in bar:
123 |         end = np.min([i + bs, num_loci])
124 | 
125 |         x = model.embed(to_cuda(torch.arange(i, end))).to_dense().detach().cpu().numpy()
126 |         all_Es.append(x)
127 | 
128 |     all_Es = np.vstack(all_Es)  # final shape - (num_cells * num_chrom_loci, hidden_dim)
129 | 
130 |     all_Es = all_Es.reshape((model.num_cells, model.N, model.hidden_dim))
131 | 
132 |     np.save(embedding_file, all_Es)


--------------------------------------------------------------------------------
/sample_configs/config_pfc.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "schic_directory" : "/mnt/e/data/pfc",
  3 |     "label_info": {
  4 |         "path":"/mnt/e/data/pfc/label_info.pickle",
  5 |         "cell_type_key": "cluster label"
  6 |     },
  7 |     "data_directory" : "/mnt/e/data/scghost_pfc_output",
  8 |     "chromosomes" : {
  9 |         "1" : {
 10 |             "adj" : "chr1_sparse_adj.npy",
 11 |             "imputed" : "chr1_exp5_zinb_nbr_5_impute.hdf5",
 12 |             "integer" : 1
 13 |         },
 14 |         "2" : {
 15 |             "adj" : "chr2_sparse_adj.npy",
 16 |             "imputed" : "chr2_exp5_zinb_nbr_5_impute.hdf5",
 17 |             "integer" : 2
 18 |         },
 19 |         "3" : {
 20 |             "adj" : "chr3_sparse_adj.npy",
 21 |             "imputed" : "chr3_exp5_zinb_nbr_5_impute.hdf5",
 22 |             "integer" : 3
 23 |         },
 24 |         "4" : {
 25 |             "adj" : "chr4_sparse_adj.npy",
 26 |             "imputed" : "chr4_exp5_zinb_nbr_5_impute.hdf5",
 27 |             "integer" : 4
 28 |         },
 29 |         "5" : {
 30 |             "adj" : "chr5_sparse_adj.npy",
 31 |             "imputed" : "chr5_exp5_zinb_nbr_5_impute.hdf5",
 32 |             "integer" : 5
 33 |         },
 34 |         "6" : {
 35 |             "adj" : "chr6_sparse_adj.npy",
 36 |             "imputed" : "chr6_exp5_zinb_nbr_5_impute.hdf5",
 37 |             "integer" : 6
 38 |         },
 39 |         "7" : {
 40 |             "adj" : "chr7_sparse_adj.npy",
 41 |             "imputed" : "chr7_exp5_zinb_nbr_5_impute.hdf5",
 42 |             "integer" : 7
 43 |         },
 44 |         "8" : {
 45 |             "adj" : "chr8_sparse_adj.npy",
 46 |             "imputed" : "chr8_exp5_zinb_nbr_5_impute.hdf5",
 47 |             "integer" : 8
 48 |         },
 49 |         "9" : {
 50 |             "adj" : "chr9_sparse_adj.npy",
 51 |             "imputed" : "chr9_exp5_zinb_nbr_5_impute.hdf5",
 52 |             "integer" : 9
 53 |         },
 54 |         "10" : {
 55 |             "adj" : "chr10_sparse_adj.npy",
 56 |             "imputed" : "chr10_exp5_zinb_nbr_5_impute.hdf5",
 57 |             "integer" : 10
 58 |         },
 59 |         "11" : {
 60 |             "adj" : "chr11_sparse_adj.npy",
 61 |             "imputed" : "chr11_exp5_zinb_nbr_5_impute.hdf5",
 62 |             "integer" : 11
 63 |         },
 64 |         "12" : {
 65 |             "adj" : "chr12_sparse_adj.npy",
 66 |             "imputed" : "chr12_exp5_zinb_nbr_5_impute.hdf5",
 67 |             "integer" : 12
 68 |         },
 69 |         "13" : {
 70 |             "adj" : "chr13_sparse_adj.npy",
 71 |             "imputed" : "chr13_exp5_zinb_nbr_5_impute.hdf5",
 72 |             "integer" : 13
 73 |         },
 74 |         "14" : {
 75 |             "adj" : "chr14_sparse_adj.npy",
 76 |             "imputed" : "chr14_exp5_zinb_nbr_5_impute.hdf5",
 77 |             "integer" : 14
 78 |         },
 79 |         "15" : {
 80 |             "adj" : "chr15_sparse_adj.npy",
 81 |             "imputed" : "chr15_exp5_zinb_nbr_5_impute.hdf5",
 82 |             "integer" : 15
 83 |         },
 84 |         "16" : {
 85 |             "adj" : "chr16_sparse_adj.npy",
 86 |             "imputed" : "chr16_exp5_zinb_nbr_5_impute.hdf5",
 87 |             "integer" : 16
 88 |         },
 89 |         "17" : {
 90 |             "adj" : "chr17_sparse_adj.npy",
 91 |             "imputed" : "chr17_exp5_zinb_nbr_5_impute.hdf5",
 92 |             "integer" : 17
 93 |         },
 94 |         "18" : {
 95 |             "adj" : "chr18_sparse_adj.npy",
 96 |             "imputed" : "chr18_exp5_zinb_nbr_5_impute.hdf5",
 97 |             "integer" : 18
 98 |         },
 99 |         "19" : {
100 |             "adj" : "chr19_sparse_adj.npy",
101 |             "imputed" : "chr19_exp5_zinb_nbr_5_impute.hdf5",
102 |             "integer" : 19
103 |         },
104 |         "20" : {
105 |             "adj" : "chr20_sparse_adj.npy",
106 |             "imputed" : "chr20_exp5_zinb_nbr_5_impute.hdf5",
107 |             "integer" : 20
108 |         },
109 |         "21" : {
110 |             "adj" : "chr21_sparse_adj.npy",
111 |             "imputed" : "chr21_exp5_zinb_nbr_5_impute.hdf5",
112 |             "integer" : 21
113 |         },
114 |         "22" : {
115 |             "adj" : "chr22_sparse_adj.npy",
116 |             "imputed" : "chr22_exp5_zinb_nbr_5_impute.hdf5",
117 |             "integer" : 22
118 |         }
119 |     },
120 |     "chrom_sizes" : "data/hg19.chrom.sizes",
121 |     "chrom_indices" : "/mnt/e/data/scghost_pfc_output/chrom_indices.pkl",
122 |     "embeddings_path" : "/mnt/e/data/pfc/embed/exp5_zinb_0_origin.npy",
123 |     "higashi_scab_path" : "/mnt/e/data/pfc/scAB.hdf5",
124 |     "cell_type" : null,
125 |     "random_walk" : {
126 |         "num_walks" : 25,
127 |         "ignore_top" : 0.02,
128 |         "top_percentile" : 0.25
129 |     },
130 |     "eps": 1e-8,
131 |     "num_clusters" : 5,
132 |     "batch_size" : 16,
133 |     "epochs" : 5,
134 |     "resolution" : 500000,
135 |     "neighbor_contacts" : true,
136 |     "nearest_neighbor_override" : null,
137 |     "gpu_uniques" : true,
138 |     "cluster_gpu_caching" : true,
139 |     "kmeans_init" : 1
140 | }


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
  1 | aiohttp==3.8.6
  2 | aiosignal==1.3.1
  3 | anyio==4.0.0
  4 | argon2-cffi==23.1.0
  5 | argon2-cffi-bindings==21.2.0
  6 | arrow==1.3.0
  7 | async-timeout==4.0.3
  8 | attrs==23.1.0
  9 | beautifulsoup4==4.12.2
 10 | bleach==6.1.0
 11 | bokeh==3.3.0
 12 | Brotli @ file:///tmp/abs_ecyw11_7ze/croots/recipe/brotli-split_1659616059936/work
 13 | cachetools==5.3.2
 14 | certifi==2023.7.22
 15 | cffi @ file:///croot/cffi_1670423208954/work
 16 | charset-normalizer @ file:///tmp/build/80754af9/charset-normalizer_1630003229654/work
 17 | click==8.1.7
 18 | click-plugins==1.1.1
 19 | cligj==0.7.2
 20 | cloudpickle==3.0.0
 21 | colorcet==3.0.1
 22 | contourpy==1.1.1
 23 | cryptography @ file:///croot/cryptography_1694444244250/work
 24 | cubinlinker-cu11==0.3.0.post1
 25 | cucim==23.10.0
 26 | cuda-python==11.8.2
 27 | cudf-cu11==23.10.0
 28 | cugraph-cu11==23.10.0
 29 | cuml-cu11==23.10.0
 30 | cuproj-cu11==23.10.0
 31 | cupy-cuda11x==12.2.0
 32 | cuspatial-cu11==23.10.0
 33 | cuxfilter-cu11==23.10.0
 34 | Cython==3.0.4
 35 | dask==2023.9.2
 36 | dask-cuda==23.10.0
 37 | dask-cudf-cu11==23.10.0
 38 | datashader==0.16.0
 39 | defusedxml==0.7.1
 40 | distributed==2023.9.2
 41 | exceptiongroup==1.1.3
 42 | fastjsonschema==2.18.1
 43 | fastrlock==0.8.2
 44 | filelock @ file:///croot/filelock_1672387128942/work
 45 | fiona==1.9.5
 46 | fqdn==1.5.1
 47 | frozenlist==1.4.0
 48 | fsspec==2023.10.0
 49 | geopandas==0.14.0
 50 | gmpy2 @ file:///tmp/build/80754af9/gmpy2_1645455533097/work
 51 | h5py @ file:///croot/h5py_1691589708553/work
 52 | hmmlearn @ file:///home/conda/feedstock_root/build_artifacts/hmmlearn_1696709150716/work
 53 | holoviews==1.18.0
 54 | idna @ file:///croot/idna_1666125576474/work
 55 | importlib-metadata==6.8.0
 56 | isoduration==20.11.0
 57 | Jinja2 @ file:///croot/jinja2_1666908132255/work
 58 | joblib @ file:///croot/joblib_1685113087166/work
 59 | jsonpointer==2.4
 60 | jsonschema==4.19.1
 61 | jsonschema-specifications==2023.7.1
 62 | jupyter-events==0.8.0
 63 | jupyter_client==8.5.0
 64 | jupyter_core==5.4.0
 65 | jupyter_server==2.9.1
 66 | jupyter_server_proxy==4.1.0
 67 | jupyter_server_terminals==0.4.4
 68 | jupyterlab-pygments==0.2.2
 69 | lazy_loader==0.3
 70 | linkify-it-py==2.0.2
 71 | llvmlite==0.40.1
 72 | locket==1.0.0
 73 | Markdown==3.5
 74 | markdown-it-py==3.0.0
 75 | MarkupSafe @ file:///opt/conda/conda-bld/markupsafe_1654597864307/work
 76 | mdit-py-plugins==0.4.0
 77 | mdurl==0.1.2
 78 | mistune==3.0.2
 79 | mkl-fft @ file:///croot/mkl_fft_1695058164594/work
 80 | mkl-random @ file:///croot/mkl_random_1695059800811/work
 81 | mkl-service==2.4.0
 82 | mpmath @ file:///croot/mpmath_1690848262763/work
 83 | msgpack==1.0.7
 84 | multidict==6.0.4
 85 | multipledispatch==1.0.0
 86 | nbclient==0.8.0
 87 | nbconvert==7.9.2
 88 | nbformat==5.9.2
 89 | networkx @ file:///croot/networkx_1690561992265/work
 90 | numba==0.57.1
 91 | numpy==1.24.3
 92 | nvtx==0.2.8
 93 | opencv-python==4.7.0.72
 94 | overrides==7.4.0
 95 | packaging==23.2
 96 | pandas==1.5.3
 97 | pandocfilters==1.5.0
 98 | panel==1.3.0
 99 | param==2.0.0
100 | partd==1.4.1
101 | Pillow==9.5.0
102 | platformdirs==3.11.0
103 | prometheus-client==0.17.1
104 | protobuf==4.24.4
105 | psutil==5.9.6
106 | ptxcompiler-cu11==0.7.0.post1
107 | ptyprocess==0.7.0
108 | pyarrow==12.0.1
109 | pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work
110 | pyct==0.5.0
111 | Pygments==2.16.1
112 | pylibcugraph-cu11==23.10.0
113 | pylibraft-cu11==23.10.0
114 | pynvml==11.4.1
115 | pyOpenSSL @ file:///croot/pyopenssl_1690223430423/work
116 | pyproj==3.6.1
117 | PySocks @ file:///home/builder/ci_310/pysocks_1640793678128/work
118 | python-dateutil==2.8.2
119 | python-json-logger==2.0.7
120 | pytz==2023.3
121 | pyviz_comms==3.0.0
122 | PyYAML @ file:///croot/pyyaml_1698096049011/work
123 | pyzmq==25.1.1
124 | raft-dask-cu11==23.10.0
125 | referencing==0.30.2
126 | requests @ file:///croot/requests_1690400202158/work
127 | rfc3339-validator==0.1.4
128 | rfc3986-validator==0.1.1
129 | rmm-cu11==23.10.0
130 | rpds-py==0.10.6
131 | scikit-learn @ file:///croot/scikit-learn_1690978916802/work
132 | scipy @ file:///croot/scipy_1696543286448/work/dist/scipy-1.11.3-cp310-cp310-linux_x86_64.whl#sha256=16a8c87e543daeb96d1310b2283f542fef2de84ee7090f60187eb71f71cb430c
133 | Send2Trash==1.8.2
134 | shapely==2.0.2
135 | simpervisor==1.0.0
136 | six==1.16.0
137 | sniffio==1.3.0
138 | sortedcontainers==2.4.0
139 | soupsieve==2.5
140 | sympy @ file:///croot/sympy_1668202399572/work
141 | tblib==3.0.0
142 | terminado==0.17.1
143 | threadpoolctl==3.2.0
144 | tinycss2==1.2.1
145 | toolz==0.12.0
146 | torch==2.1.0
147 | torchaudio==2.1.0
148 | torchvision==0.16.0
149 | tornado==6.3.3
150 | tqdm==4.65.0
151 | traitlets==5.12.0
152 | treelite==3.9.1
153 | treelite-runtime==3.9.1
154 | triton==2.1.0
155 | types-python-dateutil==2.8.19.14
156 | typing_extensions @ file:///croot/typing_extensions_1690297465030/work
157 | tzdata==2023.3
158 | uc-micro-py==1.0.2
159 | ucx-py-cu11==0.34.0
160 | uri-template==1.3.0
161 | urllib3 @ file:///croot/urllib3_1698257533958/work
162 | webcolors==1.13
163 | webencodings==0.5.1
164 | websocket-client==1.6.4
165 | xarray==2023.10.1
166 | xyzservices==2023.10.1
167 | yarl==1.9.2
168 | zict==3.0.0
169 | zipp==3.17.0
170 | 


--------------------------------------------------------------------------------
/scghost.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import pickle
  4 | import torch
  5 | import numpy as np
  6 | import gc
  7 | 
  8 | from utilities.parsers import parse_config, parse_chromosomes, parse_cell_types, parse_nearest_neighbors
  9 | 
 10 | from modules.preprocessing import compute_chrom_indices, extract_OEMs
 11 | from modules.postprocessing import post_process_samples
 12 | from modules.random_walk import sample_chrom
 13 | from modules.embedding import embed_single_cells_unified, prep_pairs_labels
 14 | from modules.clustering import scghost_clustering
 15 | from tqdm import trange
 16 | 
 17 | if __name__ == '__main__':
 18 | 
 19 |     parser = argparse.ArgumentParser()
 20 |     parser.add_argument('--config', metavar='cfg', type=str,default='./config.json',help='Path to the configuration file')
 21 | 
 22 |     args = parser.parse_args()
 23 | 
 24 |     runtime_args = parse_config(args.config)
 25 | 
 26 |     os.makedirs(runtime_args['data_directory'],exist_ok=True)
 27 | 
 28 |     num_walks = runtime_args['random_walk']['num_walks']
 29 |     neighbor_contacts = True if 'neighbor_contacts' not in runtime_args else runtime_args['neighbor_contacts']
 30 |     gpu_uniques = False if 'gpu_uniques' not in runtime_args else runtime_args['gpu_uniques']
 31 | 
 32 |     # define globals
 33 |     print('Parsing chromosomes')
 34 |     chromosomes = parse_chromosomes(runtime_args)
 35 |     
 36 |     print('Parsing chromosome indices')
 37 |     chrom_indices = compute_chrom_indices(runtime_args) if runtime_args[
 38 |         'chrom_indices'
 39 |     ] is None else pickle.load(open(runtime_args['chrom_indices'],'rb'))
 40 |     
 41 |     print('Parsing cell types')
 42 |     cell_type = runtime_args['cell_type']
 43 |     cell_type_index = parse_cell_types(runtime_args)
 44 | 
 45 |     print('Parsing remaining global variables')
 46 |     chrom_start_end = np.load(os.path.join(runtime_args['schic_directory'],'chrom_start_end.npy'))
 47 | 
 48 |     nearest_neighbors = None
 49 |         
 50 |     if 'nearest_neighbor_override' in runtime_args and runtime_args['nearest_neighbor_override'] is not None:
 51 |         print('Using nearest neighbor override')
 52 |         nearest_neighbors = np.load(runtime_args['nearest_neighbor_override']) if runtime_args['nearest_neighbor_override'] is not None else parse_nearest_neighbors(runtime_args)
 53 |     else:
 54 |         nearest_neighbors = parse_nearest_neighbors(runtime_args)
 55 |     
 56 |     batch_size = runtime_args['batch_size']
 57 |     n_epochs = runtime_args['epochs']
 58 | 
 59 |     # per chromosome loop
 60 |     for chrom in chromosomes:
 61 | 
 62 |         # if embedding already generated, skip
 63 |         if os.path.exists(
 64 |             os.path.join(runtime_args['data_directory'],'{0}_embeddings.npy'.format(chrom))
 65 |         ):
 66 |             continue
 67 | 
 68 |         print('Processing chromosome {0}'.format(chrom))
 69 |         impute_path = runtime_args['chromosomes'][chrom]['imputed']        
 70 | 
 71 |         # compute O/E matrices
 72 | 
 73 |         oem_override = None if 'oe_matrices' not in runtime_args['chromosomes'][chrom] else runtime_args['chromosomes'][chrom]['oe_matrices']
 74 | 
 75 |         OEMs = extract_OEMs(
 76 |             os.path.join(runtime_args['schic_directory'],impute_path),
 77 |             cell_type_index,
 78 |             chrom_indices[chrom],
 79 |             None,
 80 |             runtime_args['chromosomes'][chrom]['integer'],
 81 |             chrom_start_end,
 82 |             save_path=None,
 83 |             eps=runtime_args['eps']
 84 |         ) if oem_override is None else np.load(oem_override)['contact_maps']
 85 |         gc.collect()
 86 | 
 87 |         # random walk
 88 |         OEMs = torch.tensor(OEMs)
 89 |         corr_OEMs = torch.zeros_like(OEMs)
 90 | 
 91 |         for i in trange(len(OEMs)):
 92 |             corr_OEMs[i] = torch.nan_to_num(torch.corrcoef(OEMs[i]))
 93 |             corr_OEMs[i].fill_diagonal_(0)
 94 | 
 95 |         corr_OEMs = corr_OEMs.type(torch.bfloat16)
 96 | 
 97 |         gc.collect()
 98 |         
 99 |         all_cell_chrom_samples = sample_chrom(chrom,corr_OEMs,np.arange(len(corr_OEMs)),nearest_neighbors,num_walks=num_walks)
100 | 
101 |         del corr_OEMs
102 |         gc.collect()
103 |         torch.cuda.empty_cache()
104 | 
105 |         # label calibration
106 |         all_continuous_pairs,all_continuous_labels = post_process_samples(
107 |             all_cell_chrom_samples,
108 |             OEMs,
109 |             nearest_neighbors,
110 |             neighbor_contacts=neighbor_contacts
111 |         )
112 | 
113 |         all_continuous_pairs,all_continuous_labels = prep_pairs_labels(
114 |             all_continuous_pairs,
115 |             all_continuous_labels,
116 |             OEMs[0].shape[0],
117 |             np.arange(len(OEMs))
118 |         )
119 |         
120 |         del all_cell_chrom_samples
121 |         gc.collect()
122 |         torch.cuda.empty_cache()
123 | 
124 |         # embedding
125 |         output_file = os.path.join(runtime_args['data_directory'], '{0}_embeddings'.format(chrom))
126 | 
127 |         embed_single_cells_unified(
128 |             all_continuous_pairs,
129 |             all_continuous_labels,
130 |             OEMs,
131 |             output_file,
132 |             epochs=n_epochs,
133 |             cell_nums=None,
134 |             batch_size=batch_size,
135 |             verbose=True,
136 |             prepped=True
137 |         )
138 | 
139 |         del all_continuous_labels,all_continuous_pairs
140 | 
141 |         gc.collect()
142 |         torch.cuda.empty_cache()
143 | 
144 |     # cluster on all embeddings
145 |     print('Clustering')
146 |     scghost_clustering(runtime_args)


--------------------------------------------------------------------------------
/utilities/parsers.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import numpy as np
  3 | import torch
  4 | import h5py
  5 | import pickle
  6 | import gc
  7 | import os
  8 | 
  9 | from utilities.chrom_sizes import chrom_sizes
 10 | from utilities.helper import to_cuda
 11 | from sklearn.neighbors import NearestNeighbors
 12 | 
 13 | def parse_config(config_filepath):
 14 |     with open(config_filepath) as config_file:
 15 |         config_data = json.load(config_file)
 16 |         
 17 |         return config_data
 18 |         
 19 | def parse_higashi_scab(runtime_args):
 20 |     scAB = h5py.File(runtime_args['higashi_scab_path'])
 21 |     chromosomes = parse_chromosomes(runtime_args)
 22 | 
 23 |     scAB_chrom = np.array(scAB['compartment']['bin']['chrom']).astype(str)
 24 |     scAB_start = np.array(scAB['compartment']['bin']['start'])
 25 | 
 26 |     hig_scab = []
 27 | 
 28 |     num_cells = 0
 29 | 
 30 |     for i in range(len(scAB['compartment'])):
 31 |         if 'cell_%d' % i in scAB['compartment']:
 32 |             num_cells += 1
 33 | 
 34 |     for cn in range(num_cells):
 35 |         hig_scab.append(scAB['compartment']['cell_%d' % cn])
 36 | 
 37 |     hig_scab = np.array(hig_scab)
 38 | 
 39 |     return (hig_scab,scAB_chrom,scAB_start)
 40 | 
 41 | def parse_chrom_embeds(runtime_args,cuda=True):
 42 | 
 43 |     chromosomes = parse_chromosomes(runtime_args)
 44 |     gpu_caching = False if 'cluster_gpu_caching' not in runtime_args else runtime_args['cluster_gpu_caching']
 45 | 
 46 |     hig_scab,scAB_chrom,scAB_start = parse_higashi_scab(runtime_args)
 47 | 
 48 |     N = len(hig_scab)
 49 | 
 50 |     chrom_embeds = {}
 51 |     chrom_highlow = {}
 52 | 
 53 |     resolution = runtime_args['resolution']
 54 | 
 55 |     for chrom in chromosomes:
 56 |         
 57 |         ci_path = os.path.join(runtime_args['data_directory'],'chrom_indices.pkl') if runtime_args['chrom_indices'] is None else runtime_args['chrom_indices']
 58 |         chrom_indices = pickle.load(
 59 |             open(ci_path,'rb')
 60 |         )['{0}'.format(chrom)]
 61 | 
 62 |         scab_chrom_indices = np.where(scAB_chrom == 'chr{0}'.format(chrom))[0]
 63 |         _,scab_crop,scghost_crop = np.intersect1d(scAB_start[scab_chrom_indices] // resolution,chrom_indices,return_indices=True)
 64 |         scab_indices = scab_chrom_indices[scab_crop]
 65 |         scghost_indices = chrom_indices[scghost_crop]
 66 | 
 67 |         scab_highidx = np.argsort(hig_scab[:,scab_indices],axis=1)[:,-25:]
 68 |         scab_lowidx = np.argsort(hig_scab[:,scab_indices],axis=1)[:,:25]
 69 |         
 70 |         chrom_highlow['{0}'.format(chrom)] = {
 71 |             'high' : to_cuda(torch.tensor(scab_highidx)) if gpu_caching else torch.tensor(scab_highidx),
 72 |             'low' : to_cuda(torch.tensor(scab_lowidx)) if gpu_caching else torch.tensor(scab_lowidx)
 73 |         }
 74 |         
 75 |         embedding_flag = ('embeddings' in runtime_args['chromosomes'][chrom])
 76 | 
 77 |         embed_path = os.path.join(
 78 |             runtime_args['data_directory'],'{0}_embeddings.npy'.format(chrom)
 79 |         )
 80 |         if embedding_flag and runtime_args['chromosomes'][chrom]['embeddings'] is not None:
 81 |             embed_path = runtime_args['chromosomes'][chrom]['embeddings']
 82 | 
 83 |         scembeds = np.load(embed_path)
 84 |         scembeds = scembeds[:,scghost_crop]
 85 |         
 86 |         chrom_embeds['{0}'.format(chrom)] = to_cuda(torch.tensor(scembeds)) if gpu_caching else torch.tensor(scembeds)
 87 | 
 88 |         gc.collect()
 89 | 
 90 |     return {
 91 |         'embeds':chrom_embeds,
 92 |         'highlow':chrom_highlow,
 93 |         'N':N,
 94 |     }
 95 | 
 96 | def parse_chromosomes(runtime_args):
 97 | 
 98 |     sizes = chrom_sizes(runtime_args['chrom_sizes'])
 99 |     chromosomes = runtime_args['chromosomes']
100 |     
101 |     # deprecate this if condition
102 |     if chromosomes == 'autosomes':
103 |         chrom_list = []
104 | 
105 |         for chrom in sizes:
106 |             chrom_num = chrom[3:]
107 |             if chrom_num.isnumeric():
108 |                 chrom_list.append(int(chrom_num))
109 |         chromosomes = np.array(chrom_list)
110 |     else:
111 |         chromosomes = np.array([c for c in chromosomes])
112 |     
113 |     return chromosomes
114 | 
def parse_nearest_neighbors(runtime_args):
    """Compute nearest-neighbor cell indices from the per-cell embeddings.

    Loads ``runtime_args['embeddings_path']``, optionally restricts the cells
    to ``runtime_args['cell_type']`` (when both a label file and a cell type
    are configured), and queries 6 neighbors per cell in embedding space.

    Returns:
        numpy.ndarray: the index array returned by ``kneighbors`` on the
        fitted embeddings (one row per retained cell, 6 columns).
    """
    cell_type = runtime_args['cell_type']
    label_cfg = runtime_args['label_info']

    label_info = None
    if label_cfg is not None:
        # Context manager so the label file handle is not leaked.
        with open(label_cfg['path'], 'rb') as label_file:
            label_info = pickle.load(label_file)

    embeddings = np.load(runtime_args['embeddings_path'])

    if label_info is not None and cell_type is not None:
        cell_types = np.array(label_info[label_cfg['cell_type_key']]).astype(str)
        embeddings = embeddings[np.where(cell_types == cell_type)[0]]

    nbrs = NearestNeighbors(n_neighbors=6).fit(embeddings)
    _, indices = nbrs.kneighbors(embeddings)

    return indices
133 | 
def parse_cell_types(runtime_args):
    """Return the indices of the cells matching the configured cell type.

    Returns ``None`` when no label file is configured
    (``runtime_args['label_info'] is None``) or when no cell type is selected
    (``runtime_args['cell_type'] is None``), meaning "use every cell".
    Otherwise returns a 1-D integer array of positions in the label list
    whose (stringified) label equals ``runtime_args['cell_type']``.
    """
    label_cfg = runtime_args['label_info']
    if label_cfg is None:
        return None

    cell_type = runtime_args['cell_type']
    if cell_type is None:
        # Nothing to filter, so there is no need to read the label file.
        return None

    with open(label_cfg['path'], 'rb') as label_file:
        label_info = pickle.load(label_file)

    cell_types = np.array(label_info[label_cfg['cell_type_key']]).astype(str)

    return np.where(cell_types == cell_type)[0]


--------------------------------------------------------------------------------
/modules/analysis.py:
--------------------------------------------------------------------------------
  1 | # sort chromosome subcompartments using single cell AB compartments from Higashi
  2 | import h5py
  3 | import os
  4 | import seaborn as sns
  5 | 
  6 | os.environ["OMP_NUM_THREADS"] = "10"
  7 | 
  8 | import numpy as np
  9 | from tqdm import trange, tqdm
 10 | import pickle
 11 | import pandas as pd
 12 | import argparse
 13 | from umap import UMAP
 14 | from fbpca import pca
 15 | from sklearn.preprocessing import StandardScaler, quantile_transform
 16 | from sklearn.decomposition import PCA
 17 | from scipy.stats import rankdata
 18 | from concurrent.futures import ProcessPoolExecutor, as_completed
 19 | import matplotlib.pyplot as plt
 20 | import json
 21 | 
 22 | def get_expected(M,eps=1e-8):
 23 |     E = np.zeros_like(M)
 24 |     l = len(M)
 25 | 
 26 |     for i in range(M.shape[0]):
 27 |         contacts = np.diag(M,i)
 28 |         expected = contacts.sum() / (l-i)
 29 |         # expected = np.median(contacts)
 30 |         x_diag,y_diag = np.diag_indices(M.shape[0]-i)
 31 |         x,y = x_diag,y_diag+i
 32 |         E[x,y] = expected
 33 | 
 34 |     E += E.T
 35 |     E = np.nan_to_num(E) + eps
 36 |     
 37 |     return E
 38 |     
 39 | def get_oe_matrix(M):
 40 |     E = get_expected(M)
 41 |     oe = np.nan_to_num(M / E)
 42 |     np.fill_diagonal(oe,1)
 43 |     
 44 |     return oe
 45 | 
 46 | # tailored for k=5 and pfc
 47 | def prep_scatterplot(embeddings_dir,chrom_indices_file,scAB_file,output_file='tutorial_embeds.hdf5'):
 48 | 
 49 |     chrom_indices = pickle.load(open(chrom_indices_file,'rb'))
 50 |     stacked_pcs = []
 51 | 
 52 |     for chrom_num in range(1,23):
 53 |         chrom_indices = pickle.load(open('/mnt/e/data/scghost_pfc_output/chrom_indices.pkl','rb'))['%d' % chrom_num]
 54 | 
 55 |         sparse_M = np.load('/mnt/e/data/pfc/chr%d_sparse_adj.npy' % chrom_num,allow_pickle=True)
 56 |         pseudo_bulk = sparse_M.sum(axis=0).toarray()
 57 |         cov = np.sqrt(pseudo_bulk.sum(axis=1))
 58 |         pseudo_bulk /= cov[None,:]
 59 |         pseudo_bulk /= cov[:,None]
 60 |         pseudo_bulk = np.nan_to_num(pseudo_bulk)[chrom_indices][:,chrom_indices]
 61 |         pseudo_OE = get_oe_matrix(pseudo_bulk)
 62 | 
 63 |         Rpool = np.nan_to_num(np.corrcoef(pseudo_OE))
 64 |         Rpoolmean = Rpool.mean(axis=0,keepdims=True)
 65 |         Rpool = Rpool - Rpoolmean
 66 |         _,_,V = np.linalg.svd(Rpool)
 67 | 
 68 |         Es = np.load(os.path.join(embeddings_dir,f'/mnt/e/data/scghost_pfc_output/{chrom_num}_embeddings.npy'))
 69 |         embedding_corrs = np.zeros((Es.shape[0],Es.shape[1],Es.shape[1]))
 70 | 
 71 |         num_cells = len(Es)
 72 | 
 73 |         for i in trange(num_cells):
 74 |             embedding_corrs[i] = np.corrcoef(Es[i])
 75 | 
 76 |         pcs = np.zeros((Es.shape[0],Es.shape[1]))
 77 | 
 78 |         for i,ec in enumerate(embedding_corrs):
 79 |             tec = ec - Rpoolmean
 80 |             pc = tec.dot(V[0,:].T)
 81 |             pcs[i] = pc
 82 |             
 83 |         stacked_pcs.append(pcs)
 84 |         
 85 |     stacked_pcs = np.hstack(stacked_pcs)
 86 | 
 87 |     with h5py.File(output_file,'w') as f:
 88 |         f.create_group('compartment')
 89 |         f['compartment'].create_group('bin')
 90 | 
 91 |         for i in range(num_cells):
 92 |             f['compartment'].create_dataset('cell_%d' % i,data=stacked_pcs[i])
 93 | 
 94 | 
 95 | def get_config(config_path = "./config.jSON"):
 96 |     c = open(config_path,"r")
 97 |     return json.load(c)
 98 | 
 99 | 
def parse_args():
    """Parse command-line arguments.

    Supports ``-c/--config`` (path to the JSON configuration file), which
    defaults to ``./config.json`` — lowercase so the default resolves on
    case-sensitive filesystems.
    """
    parser = argparse.ArgumentParser(description="Higashi single cell compartment calling")
    parser.add_argument('-c', '--config', type=str, default="./config.json")

    return parser.parse_args()
105 | 
106 | 
def get_palette(label_order, label_name=None, config=None):
    """Return a color palette for the given labels.

    If ``config['vis_palette'][label_name]`` exists it is returned unchanged.
    Otherwise matplotlib cycle colors (``'C0'``, ``'C1'``, ...) are used for
    up to 10 labels, and a concatenation of seaborn palettes for more.

    Args:
        label_order: sequence of labels needing a color.
        label_name: key into ``config['vis_palette']``.
        config: optional configuration dictionary.

    Returns:
        The configured palette, or a list with one color per label (the
        fallback pool holds 64 colors; longer label lists are truncated).
    """
    try:
        return config['vis_palette'][label_name]
    except (LookupError, TypeError):
        # No usable configured palette (config is None or the key is
        # missing); fall through to the generated defaults.
        pass

    n_labels = len(label_order)
    if n_labels <= 10:
        return [f'C{i}' for i in range(n_labels)]

    # Only build the seaborn palettes when they are actually needed.
    pal1 = list(sns.color_palette("Paired"))
    pal2 = list(sns.color_palette("Set2"))
    pal3 = list(sns.color_palette("husl", 12))
    pal_all = pal1 + pal2 + pal3 + pal1 + pal2 + pal3
    return pal_all[:n_labels]
122 | 
123 | 
def sc_compartment2embedding(embeds_path, data_dir, output_file="tutorial_scatterplot.pdf", extra="", save_name=""):
    """Embed per-cell compartment features in 2-D with UMAP and save a scatterplot.

    Reads cell labels from ``<data_dir>/label_info.pickle`` (key
    ``"cluster label"``) and per-cell feature vectors from the
    ``compartment/cell_<i>`` datasets of ``embeds_path``. Features are
    quantile-transformed, reduced to 32 principal components, embedded with
    UMAP and plotted colored by label.

    Args:
        embeds_path: HDF5 file with a ``compartment`` group of ``cell_<i>`` datasets.
        data_dir: directory containing ``label_info.pickle``.
        output_file: path of the figure to write.
        extra: unused; kept for interface compatibility.
        save_name: unused; kept for interface compatibility.

    Returns:
        tuple: ``(vec, label, pal)`` — UMAP coordinates, the labels of the
        plotted cells, and the palette mapping used.
    """
    with open(os.path.join(data_dir, "label_info.pickle"), "rb") as label_file:
        label_info = pickle.load(label_file)
    label = np.array(label_info["cluster label"])
    print(label)

    total_feats = []

    with h5py.File(embeds_path, "r") as cp_f:
        print(cp_f.keys())
        cp = cp_f['compartment']

        # Use as many cells as both the labels and the file provide instead
        # of a dataset-specific hard-coded count.
        n_available = sum(1 for key in cp.keys() if key.startswith('cell_'))
        label = label[:min(len(label), n_available)]

        for id_ in trange(len(label)):
            total_feats.append(np.array(cp['cell_%d' % id_]))

    feats = np.stack(total_feats, axis=0)
    print(feats.shape)

    # Fixed colors for the PFC cell types; used whenever they cover every
    # label present, otherwise fall back to a generated palette.
    pfc_pal = {'L2/3': '#e51f4e', 'L4': '#45af4b', 'L5': '#ffe011', 'L6': '#0081cc',
               'Ndnf': '#ff7f35', 'Vip': '#951eb7', 'Pvalb': '#4febee',
               'Sst': '#ed37d9', 'Astro': '#d1f33c', 'ODC': '#f9bdbb',
               'OPC': '#067d81', 'MG': '#e4bcfc', 'MP': '#ab6c1e',
               "Endo": '#780100'}
    unique_labels = np.unique(label)
    if all(name in pfc_pal for name in unique_labels):
        pal = pfc_pal
    else:
        pal = dict(zip(unique_labels, get_palette(unique_labels)))

    temp = quantile_transform(feats, output_distribution='uniform', n_quantiles=feats.shape[0])
    size = 32
    pca = PCA(n_components=size)
    temp = pca.fit_transform(temp)

    vec = UMAP(n_components=2).fit_transform(temp)
    fig, ax = plt.subplots(figsize=(7, 5))
    sns.scatterplot(x=vec[:, 0], y=vec[:, 1], hue=label, linewidth=0, s=2, alpha=1.0, palette=pal, ax=ax)

    # Sort legend entries alphabetically by label.
    handles, labels = ax.get_legend_handles_labels()
    labels, handles = zip(*sorted(zip(labels, handles), key=lambda t: t[0]))
    ax.legend(handles=handles, labels=labels, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.tight_layout()
    plt.savefig(output_file, dpi=300)
    plt.close('all')

    return (vec, label, pal)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Overview of scGHOST
 2 | 
 3 | ![Overview of scGHOST](scghost_overview.png)
 4 | 
 5 | scGHOST is an unsupervised single-cell subcompartment annotation method based on graph embedding with constrained random walk sampling.
 6 | scGHOST is designed to be run on a single-cell Hi-C (scHi-C) dataset which has undergone imputation by [Higashi](https://github.com/ma-compbio/Higashi) ([Zhang et al. 2022](https://www.nature.com/articles/s41587-021-01034-y)).
 7 | scGHOST assigns embeddings to genomic loci in the genomes of individual cells by viewing scHi-C as graphs whose vertices are genomic loci and edges are the contact frequencies among loci.
 8 | While scGHOST is developed for scHi-C data, it can also identify single-cell subcompartments in single-cell genome imaging data.
 9 | 
10 | # Running scGHOST
11 | 
12 | ## Input data
13 | 
14 | scGHOST uses the outputs from [Higashi](https://github.com/ma-compbio/Higashi) as its inputs.
15 | Specifically, it requires the scHi-C imputations (hdf5 format), per-cell embeddings (numpy format), sparse raw scHi-C adjacency maps (numpy format), the scA/B scores (hdf5 format), and the label info file (pickle format) describing the cell types corresponding to each cell in the dataset.
16 | 
17 | ## Installation
18 | 
19 | Before installing any Python packages, we strongly recommend using Anaconda (please refer to the [Anaconda](https://anaconda.org/) webpage for `conda` installation instructions) to create a python 3.10 environment using the following command:
20 | 
21 | `conda create --name scghost python=3.10`
22 | 
23 | After creating the environment, activate it using:
24 | 
25 | `conda activate scghost`
26 | 
27 | ### Dependencies
28 | 
29 | #### Conda installations
30 | - PyTorch (2.1.0) with CUDA (11.8)
31 | - Scikit-learn (latest)
32 | - h5py
33 | #### Pip installations
34 | - [cuML for CUDA 11.8](https://docs.rapids.ai/install#selector)
35 | - [Thread-pool Controls](https://pypi.org/project/threadpoolctl/) (> 3)
36 | 
37 | Users can install scGHOST dependencies using the `conda` or `pip` commands following the specifications above.
38 | 
39 | Systems without a CUDA-capable GPU can also run scGHOST by installing the same dependencies with the CPU-only build of PyTorch, but the source code in `modules/clustering.py` must be modified to use `SKMeans` instead of `KMeans` in the `scghost_clustering` function. We may add a flag in the config file for CPU-only execution in the future; however, in our experience, running scGHOST on the CPU alone takes far longer than on a GPU and is not recommended.
40 | 
41 | ## Hardware Requirements
42 | 
43 | scGHOST can use up to 40 GB of memory for a single-cell dataset of 4,238 cells.
44 | Considering operating system overhead, we recommend running scGHOST on a machine with at least 64 GB of memory to avoid poor performance or out-of-memory errors at runtime.
45 | 
46 | scGHOST was developed on a system with a 12-core 12th generation Intel CPU, an Nvidia RTX 3090 GPU with 24GB of VRAM, and 64GB of system memory. With GPU caching enabled, scGHOST uses a maximum of 15 GB of VRAM on the PFC dataset. With GPU caching disabled, VRAM becomes less of a limiting factor and scGHOST should run on any CUDA-capable GPU with at least 4 GB of VRAM.
47 | 
48 | ## Usage
49 | 
50 | Users can run scGHOST using the following command:
51 | 
52 | `python scghost.py --config <configuration.json>`
53 | 
54 | Sample JSON config files for scGHOST have been provided.
55 | 
56 | `configuration` is the filepath to a custom configuration file adhering to the JSON format for scGHOST. By default, scGHOST uses the included config.json file, which can be modified to the user's specifications.
57 | 
58 | **Note**: users may run into a `RuntimeWarning` after the clustering step. This is normal behavior and should not affect the accuracy of results.
59 | 
60 | ## Runtime
61 | scGHOST was run on a machine with a 12-core 12th generation Intel CPU and Nvidia RTX 3090 24GB GPU.
62 | From scratch, scGHOST takes about 2 hours to run on the sciHi-C GM12878 dataset and about 4 hours to run on the human prefrontal cortex dataset.
63 | 
64 | ## Configuration file
65 | 
66 | - `schic_directory` : the directory containing Higashi-imputed single-cell Hi-C maps.
67 | - `label_info` : `label_info.pickle` file following the [format in Higashi](https://github.com/ma-compbio/Higashi/wiki/Input-Files).
68 |   - `path` : the file path of the `label_info.pickle` file
69 |   - `cell_type_key` : the key in `label_info.pickle` with a list of the cell types in the dataset
70 | - `data_directory` : the output directory of scGHOST
71 | - `chromosomes` : the list of chromosomes to apply scGHOST to. default: autosomes
72 | - `chrom_sizes` : file path to the chromosome sizes file. default: `data/hg38.chrom.sizes`
73 | - `chrom_indices` : file path to chrom indices if previously computed. Development flag to save time over multiple runs on the same dataset. Default: `null`
74 | - `embeddings_path` : file path to the Higashi embeddings `.npy` file for each cell in the dataset
75 | - `higashi_scab_path` : file path to Higashi scA/B scores `.h5` file
76 | - `cell_type` : the cell type in the dataset to apply scGHOST on; use `null` to apply scGHOST to all cell types in the dataset. default: `null`
77 | - `random_walk` : random walk parameters
78 |   - `num_walks` : number of random walks per iteration. default: 50
79 |   - `ignore_top` : the top and bottom percentile to be ignored, to remove extreme values in the input matrix. default: 0.02
80 |   - `top_percentile` : the top percentiles within which random walks are performed. default: 0.25
81 | - `eps` : small float value to prevent dividing by zero in some functions. default: 1e-8
82 | - `num_clusters` : number of clusters to partition chromosomes into
83 | - `neighbor_contacts` : determine whether to use the average of nearest neighbor contacts as the target label during node embedding.
84 | - `nearest_neighbor_override` : use a custom numpy array to define nearest neighbors. The format should be an `N x (k+1)` array with `N` denoting the number of cells in the dataset and `k` denoting the number of nearest neighbors. Row `i` in the array should contain entries denoting which cells are the nearest neighbors of cell `i`.
85 | - `cluster_gpu_caching` : toggle caching chromosome embeddings on the GPU prior to clustering to reduce the CPU overhead of converting embedding vectors to CUDA tensors. We recommend disabling this if your GPU has less than 16 GB of memory.
86 | - `gpu_uniques` : determine whether to use the GPU to compute unique random walk samples. On machines with higher CPU core counts, CPU processing may be faster than GPU processing.
87 | - `kmeans_init` : the `n_init` parameter in scikit-learn/cuML's `KMeans`. We recommend setting this value to 1 to reduce clustering runtime; if the key is omitted, scGHOST falls back to 10.
88 | 
89 | ## Tutorials
90 | 
91 | Please follow our tutorial notebooks in the root directory for examples on how to run scGHOST with and without first running Higashi. For a sample run of scGHOST, users can download the smaller WTC-11 dataset [here](http://genome.compbio.cs.cmu.edu:8008/~kxiong/data/scghost/wtc11/). After downloading the sample data, please change the `sample_configs/config_wtc.json` configuration file accordingly to point to the correct paths and run the following command:
92 | 
93 | `python scghost.py --config sample_configs/config_wtc.json`
94 | 
95 | ## Contact
96 | Please email jianma@cs.cmu.edu or raise an issue in the github repository with any questions about installation or usage or any encountered bugs.
97 | 


--------------------------------------------------------------------------------
/modules/clustering.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | import numpy as np
  4 | import pickle
  5 | import gc
  6 | 
  7 | from hmmlearn.hmm import GaussianHMM
  8 | from sklearn.preprocessing import quantile_transform as sk_quantile_transform
  9 | from sklearn.cluster import KMeans
 10 | from utilities.parsers import parse_chrom_embeds, parse_chromosomes, parse_chromosomes, parse_higashi_scab, parse_cell_types
 11 | from utilities.helper import to_cuda
 12 | from sklearn.cluster import KMeans as SKMeans
 13 | from tqdm import trange, tqdm
 14 | from cuml import KMeans
 15 | 
 16 | DEFAULT_KMEANS_INIT = 10
 17 | 
 18 | def quantile_transform(data, n_quantiles=None):
 19 | 
 20 |     nq = n_quantiles if n_quantiles is not None else len(data)
 21 | 
 22 |     sorted_data, sort_id = torch.sort(data)
 23 |     data[sort_id] = torch.arange(data.shape[0],device=data.device,dtype=data.dtype)
 24 |     data = torch.floor(data / data.shape[0] * nq) / nq
 25 |     return data
 26 | 
 27 | def scghost_clustering(runtime_args):
 28 |     embed_data = parse_chrom_embeds(runtime_args)
 29 |     chromosomes = parse_chromosomes(runtime_args)
 30 |     cell_type_index = parse_cell_types(runtime_args)
 31 |     gpu_caching = False if 'cluster_gpu_caching' not in runtime_args else runtime_args['cluster_gpu_caching']
 32 | 
 33 |     kmeans_init = DEFAULT_KMEANS_INIT if 'kmeans_init' not in runtime_args else runtime_args['kmeans_init']
 34 | 
 35 |     chrom_embeds = embed_data['embeds']
 36 |     chrom_highlow = embed_data['highlow']
 37 | 
 38 |     N = embed_data['N']
 39 |     bar = trange(N) if cell_type_index is None else tqdm(cell_type_index)
 40 |     # bar = trange(N) if cell_type_index is None else tqdm(range(5))
 41 | 
 42 |     cell_labels = []
 43 |     cell_labels_transpose = []
 44 | 
 45 |     for cn in bar:
 46 |     # for cn in trange(25):
 47 |         
 48 |         inter_matrix = []
 49 |         
 50 |         for ii in range(0,len(chromosomes),2):
 51 |             chrom1 = chromosomes[ii]
 52 |             embed1 = chrom_embeds['{0}'.format(chrom1)][cn]
 53 |             embed1 = embed1 if gpu_caching else to_cuda(embed1)
 54 |             corr1 = torch.corrcoef(embed1)
 55 | 
 56 |             hi1 = chrom_highlow['{0}'.format(chrom1)]['high'][cn]
 57 |             lo1 = chrom_highlow['{0}'.format(chrom1)]['low'][cn]
 58 |             
 59 |             slc1 = corr1[hi1] - corr1[lo1]
 60 | 
 61 |             row = []
 62 |             
 63 |             for jj in range(1,len(chromosomes),2):
 64 |                 chrom2 = chromosomes[jj]
 65 |                 embed2 = chrom_embeds['{0}'.format(chrom2)][cn]
 66 |                 embed2 = embed2 if gpu_caching else to_cuda(embed2)
 67 |                 corr2 = torch.corrcoef(embed2)
 68 | 
 69 |                 hi2 = chrom_highlow['{0}'.format(chrom2)]['high'][cn]
 70 |                 lo2 = chrom_highlow['{0}'.format(chrom2)]['low'][cn]
 71 |             
 72 |                 slc2 = corr2[hi2] - corr2[lo2]
 73 |                 
 74 |                 op = slc1.mean(dim=0)[:,None] * slc2.mean(dim=0)[None]
 75 | 
 76 |                 opf = op.flatten()
 77 |                 opf = quantile_transform(opf,n_quantiles=1000)
 78 |                 opq = opf.reshape(op.shape)
 79 |                 
 80 |                 row.append(opq)
 81 |                 
 82 |             row = torch.hstack(row)
 83 |             inter_matrix.append(row)
 84 |             
 85 |         # inter_matrix = torch.from_numpy(np.vstack(inter_matrix)).cuda()
 86 |         inter_matrix = torch.vstack(inter_matrix)
 87 |         
 88 |         L = KMeans(n_clusters=5,n_init=kmeans_init).fit_predict(inter_matrix)
 89 |         LT = KMeans(n_clusters=5,n_init=kmeans_init).fit_predict(inter_matrix.T)
 90 | 
 91 |         cell_labels.append(L.get())
 92 |         cell_labels_transpose.append(LT.get())
 93 |         # gc.collect()
 94 |         
 95 |     cell_labels = np.array(cell_labels)
 96 |     cell_labels_transpose = np.array(cell_labels_transpose)
 97 | 
 98 |     # align using hig_scab
 99 | 
100 |     hig_scab,scAB_chrom,scAB_start = parse_higashi_scab(runtime_args)
101 | 
102 |     cmap = []
103 |     rmap = []
104 |     chrom_hig = {}
105 |     cropped_indices = {}
106 | 
107 |     data_dir = runtime_args['data_directory']
108 | 
109 |     for ii in range(0,len(chromosomes),2):
110 |         chrom = chromosomes[ii]
111 |         
112 |         chrom_indices = pickle.load(open(os.path.join(data_dir,'chrom_indices.pkl'),'rb'))['{0}'.format(chrom)]
113 |         scab_chrom_indices = np.where(scAB_chrom == 'chr{0}'.format(runtime_args['chromosomes'][chrom]['integer']))[0]
114 |         _,scab_crop,scghost_crop = np.intersect1d(scAB_start[scab_chrom_indices] // 500000,chrom_indices,return_indices=True)
115 |         scab_indices = scab_chrom_indices[scab_crop]
116 |         scghost_indices = chrom_indices[scghost_crop]
117 |         cropped_indices['{0}'.format(chrom)] = scghost_indices
118 |         
119 |         rmap.append(
120 |             np.vstack((
121 |                 np.ones(len(scghost_indices)) * runtime_args['chromosomes'][chrom]['integer'],
122 |                 np.arange(len(scghost_indices)),
123 |                 scghost_crop,
124 |                 scghost_indices
125 |             )).T
126 |         )
127 |         
128 |         chrom_hig['{0}'.format(chrom)] = hig_scab[:,scab_indices]
129 |         
130 |     for ii in range(1,len(chromosomes),2):
131 |         chrom = chromosomes[ii]
132 |         
133 |         chrom_indices = pickle.load(open(os.path.join(data_dir,'chrom_indices.pkl'),'rb'))['{0}'.format(chrom)]
134 |         scab_chrom_indices = np.where(scAB_chrom == 'chr{0}'.format(runtime_args['chromosomes'][chrom]['integer']))[0]
135 | 
136 |         _,scab_crop,scghost_crop = np.intersect1d(scAB_start[scab_chrom_indices] // 500000,chrom_indices,return_indices=True)
137 |         scab_indices = scab_chrom_indices[scab_crop]
138 |         scghost_indices = chrom_indices[scghost_crop]
139 |         cropped_indices['{0}'.format(chrom)] = scghost_indices
140 |         
141 |         cmap.append(
142 |             np.vstack((
143 |                 np.ones(len(scghost_indices)) * runtime_args['chromosomes'][chrom]['integer'],
144 |                 np.arange(len(scghost_indices)),
145 |                 scghost_crop,
146 |                 scghost_indices
147 |             )).T
148 |         )
149 |         
150 |         chrom_hig['{0}'.format(chrom)] = hig_scab[:,scab_indices]
151 | 
152 |     rmap = np.vstack(rmap)
153 |     cmap = np.vstack(cmap)
154 | 
155 |     pickle.dump(cropped_indices,open(os.path.join(data_dir,'cropped_indices.pkl'),'wb'))
156 | 
157 |     chrom_sorted_labels = {}
158 | 
159 |     for chrom in chromosomes:
160 |         chrom_sorted_labels['{0}'.format(chrom)] = []
161 |         
162 |         for i in bar:
163 |             ab = chrom_hig['{0}'.format(chrom)][i]
164 |             
165 |             m = rmap if runtime_args['chromosomes'][chrom]['integer'] % 2 == 1 else cmap
166 |             
167 |             idx = np.where(m[:,0] == runtime_args['chromosomes'][chrom]['integer'])[0]
168 |             lset = cell_labels if runtime_args['chromosomes'][chrom]['integer'] % 2 == 1 else cell_labels_transpose
169 |             
170 |             lbls = lset[i,idx]
171 |             lbls_ab = np.zeros(5)
172 |             
173 |             for k in range(5):
174 |                 ii = np.where(lbls == k)[0]
175 | 
176 |                 lbls_ab[k] = ab[ii].mean()
177 |                 
178 |             lbls_order = lbls_ab.argsort()[::-1]
179 |             lbls_sorted = lbls.copy()
180 |             
181 |             for k in range(5):
182 |                 lbls_sorted[lbls == lbls_order[k]] = k
183 |             
184 |             chrom_sorted_labels['{0}'.format(chrom)].append(lbls_sorted)
185 |             
186 |         chrom_sorted_labels['{0}'.format(chrom)] = np.array(chrom_sorted_labels['{0}'.format(chrom)])
187 |         
188 |     pickle.dump(chrom_sorted_labels,open(os.path.join(data_dir,'labels.pkl'),'wb'))


--------------------------------------------------------------------------------
/tutorial.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "### Generate JSON config file"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Enter scGHOST settings"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 1,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "# filepath settings\n",
 24 |     "schic_directory = \"/directory/of/higashi/imputed/maps\"\n",
 25 |     "label_info_path = \"/path/to/label_info.pickle\"\n",
 26 |     "label_info_cell_type_key = \"cluster label\"\n",
 27 |     "data_directory = \"/directory/to/save/scghost/outputs/\"\n",
 28 |     "\n",
 29 |     "NUM_CHROMOSOMES = 22\n",
 30 |     "chromosomes = {chrom_num : {\n",
 31 |     "    'adj' : f'chr{chrom_num}_sparse_adj.npy',\n",
 32 |     "    'imputed' : f'chr{chrom_num}_exp1_nbr_5_impute.hdf5',\n",
 33 |     "    'integer' : chrom_num,\n",
 34 |     "} for chrom_num in range(1,NUM_CHROMOSOMES+1)}\n",
 35 |     "\n",
 36 |     "chrom_sizes = 'data/hg19.chrom.sizes'\n",
 37 |     "chrom_indices = None\n",
 38 |     "embeddings_path = \"/path/to/exp1_0_origin.npy\"\n",
 39 |     "higashi_scab_path = \"/path/to/higashi/scAB.hdf5\"\n",
 40 |     "cell_type = None\n",
 41 |     "\n",
 42 |     "# hyperparameters\n",
 43 |     "random_walk_num_walks = 50\n",
 44 |     "random_walk_ignore_top = 0.02\n",
 45 |     "random_walk_top_percentile = 0.25\n",
 46 |     "eps = 1e-8\n",
 47 |     "num_clusters = 5\n",
 48 |     "batch_size = 16\n",
 49 |     "epochs = 5\n",
 50 |     "resolution = 500000\n",
 51 |     "neighbor_contacts = False\n",
 52 |     "kmeans_init = 1\n",
 53 |     "\n",
 54 |     "# misc settings\n",
 55 |     "nearest_neighbor_override = None\n",
 56 |     "gpu_uniques = True\n",
 57 |     "cluster_gpu_caching = True"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "markdown",
 62 |    "metadata": {},
 63 |    "source": [
 64 |     "Generate the Python settings dictionary and write it to tutorial.json"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 2,
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "settings_dict = {\n",
 74 |     "    'schic_directory': schic_directory,\n",
 75 |     "    'label_info': {\n",
 76 |     "        'path': label_info_path,\n",
 77 |     "        'cell_type_key': label_info_cell_type_key,\n",
 78 |     "    },\n",
 79 |     "    'data_directory': data_directory,\n",
 80 |     "    'chromosomes': chromosomes,\n",
 81 |     "    'chrom_sizes': chrom_sizes,\n",
 82 |     "    'chrom_indices': chrom_indices,\n",
 83 |     "    'embeddings_path': embeddings_path,\n",
 84 |     "    'higashi_scab_path': higashi_scab_path,\n",
 85 |     "    'cell_type': cell_type,\n",
 86 |     "    'random_walk': {\n",
 87 |     "        'num_walks': random_walk_num_walks,\n",
 88 |     "        'ignore_top': random_walk_ignore_top,\n",
 89 |     "        'top_percentile': random_walk_top_percentile,\n",
 90 |     "    },\n",
 91 |     "    'eps': eps,\n",
 92 |     "    'num_clusters': num_clusters,\n",
 93 |     "    'batch_size': batch_size,\n",
 94 |     "    'epochs': epochs,\n",
 95 |     "    'resolution': resolution,\n",
 96 |     "    'neighbor_contacts': neighbor_contacts,\n",
 97 |     "    'nearest_neighbor_override': nearest_neighbor_override,\n",
 98 |     "    'gpu_uniques': gpu_uniques,\n",
 99 |     "    'cluster_gpu_caching': cluster_gpu_caching,\n",
100 |     "    'kmeans_init': kmeans_init,\n",
101 |     "}\n",
102 |     "\n",
103 |     "\n",
104 |     "import json \n",
105 |     "\n",
106 |     "with open(\"tutorial.json\", \"w\") as outfile: \n",
107 |     "    json_string = json.dumps(settings_dict, indent=4)\n",
108 |     "    outfile.write(json_string)"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "markdown",
113 |    "metadata": {},
114 |    "source": [
115 |     "### Run scGHOST"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": null,
121 |    "metadata": {},
122 |    "outputs": [],
123 |    "source": [
124 |     "import subprocess\n",
125 |     "\n",
126 |     "subprocess.call(['python scghost.py --config tutorial.json'],shell=True)"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "markdown",
131 |    "metadata": {},
132 |    "source": [
133 |     "### Format scGHOST output"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": 2,
139 |    "metadata": {},
140 |    "outputs": [
141 |     {
142 |      "name": "stderr",
143 |      "output_type": "stream",
144 |      "text": [
145 |       "100%|██████████| 4238/4238 [01:01<00:00, 69.12it/s]\n"
146 |      ]
147 |     }
148 |    ],
149 |    "source": [
150 |     "import pickle\n",
151 |     "import os\n",
152 |     "from tqdm import trange\n",
153 |     "\n",
154 |     "# enter labels.pkl path\n",
155 |     "label_filepath = '/mnt/e/data/scghost_pfc_output/publication_results/labels.pkl'\n",
156 |     "labels = pickle.load(open(label_filepath,'rb'))\n",
157 |     "\n",
158 |     "# enter cropped_indices.pkl path\n",
159 |     "cropped_indices_filepath = '/mnt/e/data/scghost_pfc_output/publication_results/cropped_indices.pkl'\n",
160 |     "cropped_indices = pickle.load(open(cropped_indices_filepath,'rb'))\n",
161 |     "\n",
162 |     "# enter resolution\n",
163 |     "resolution = 500000\n",
164 |     "\n",
165 |     "# enter bed file output directory\n",
166 |     "bed_file_directory = 'bed_files'\n",
167 |     "chrom_prefix = 'chr' # set this to '' if the chromosome keys in labels.pkl are already chr1,chr2,... rather than 1,2,...\n",
168 |     "\n",
169 |     "sc_subcompartment_names = ['scA1','scA2','scB1','scB2','scB3'] # default for scGHOST k=5\n",
170 |     "\n",
171 |     "os.makedirs(bed_file_directory,exist_ok=True)\n",
172 |     "\n",
173 |     "num_cells = labels[ list( labels.keys() )[0] ].shape[0]\n",
174 |     "\n",
175 |     "for cell_num in trange(num_cells):\n",
176 |     "\n",
177 |     "    with open(os.path.join(bed_file_directory,f'cell_{cell_num}.bed'),'w') as f:\n",
178 |     "\n",
179 |     "        for chromosome in labels:\n",
180 |     "\n",
181 |     "            annotations = labels[chromosome][cell_num]\n",
182 |     "\n",
183 |     "            for locus in range(len(annotations)):\n",
184 |     "\n",
185 |     "                position = cropped_indices[chromosome][locus]\n",
186 |     "                annotation = sc_subcompartment_names[ annotations[locus] ]\n",
187 |     "\n",
188 |     "                line = f'{chrom_prefix}{chromosome}\\t{int(position * resolution)}\\t{int((position+1) * resolution)}\\t{annotation}\\n'\n",
189 |     "                f.write(line)"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "markdown",
194 |    "metadata": {},
195 |    "source": [
196 |     "### Generate scatter plot"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": 1,
202 |    "metadata": {},
203 |    "outputs": [
204 |     {
205 |      "name": "stderr",
206 |      "output_type": "stream",
207 |      "text": [
208 |       "/mnt/c/Users/turke/scghost_public/scGHOST/modules/analysis.py:45: RuntimeWarning: invalid value encountered in divide\n",
209 |       "  pseudo_bulk /= cov[None,:]\n",
210 |       "/mnt/c/Users/turke/scghost_public/scGHOST/modules/analysis.py:46: RuntimeWarning: invalid value encountered in divide\n",
211 |       "  pseudo_bulk /= cov[:,None]\n",
212 |       "100%|██████████| 4238/4238 [00:10<00:00, 420.16it/s]\n",
213 |       "100%|██████████| 4238/4238 [00:13<00:00, 323.12it/s]\n",
214 |       "100%|██████████| 4238/4238 [00:07<00:00, 530.87it/s]\n",
215 |       "100%|██████████| 4238/4238 [00:08<00:00, 473.63it/s]\n",
216 |       "100%|██████████| 4238/4238 [00:05<00:00, 731.00it/s]\n",
217 |       "100%|██████████| 4238/4238 [00:05<00:00, 790.55it/s]\n",
218 |       "100%|██████████| 4238/4238 [00:06<00:00, 684.44it/s]\n",
219 |       "100%|██████████| 4238/4238 [00:05<00:00, 733.62it/s] \n",
220 |       "100%|██████████| 4238/4238 [00:03<00:00, 1174.09it/s]\n",
221 |       "100%|██████████| 4238/4238 [00:05<00:00, 818.62it/s] \n",
222 |       "100%|██████████| 4238/4238 [00:04<00:00, 904.82it/s] \n",
223 |       "100%|██████████| 4238/4238 [00:04<00:00, 966.01it/s] \n",
224 |       "100%|██████████| 4238/4238 [00:02<00:00, 1833.86it/s]\n",
225 |       "100%|██████████| 4238/4238 [00:03<00:00, 1252.75it/s]\n",
226 |       "100%|██████████| 4238/4238 [00:01<00:00, 2557.56it/s]\n",
227 |       "100%|██████████| 4238/4238 [00:01<00:00, 2641.86it/s]\n",
228 |       "100%|██████████| 4238/4238 [00:01<00:00, 2954.00it/s]\n",
229 |       "100%|██████████| 4238/4238 [00:01<00:00, 3383.79it/s]\n",
230 |       "100%|██████████| 4238/4238 [00:00<00:00, 4811.81it/s]\n",
231 |       "100%|██████████| 4238/4238 [00:01<00:00, 4216.22it/s]\n",
232 |       "100%|██████████| 4238/4238 [00:00<00:00, 9060.52it/s]\n",
233 |       "100%|██████████| 4238/4238 [00:00<00:00, 10878.97it/s]\n"
234 |      ]
235 |     }
236 |    ],
237 |    "source": [
238 |     "# sort chromosome subcompartments using single cell AB compartments from Higashi\n",
239 |     "from modules.analysis import prep_scatterplot\n",
240 |     "\n",
241 |     "prep_scatterplot(\n",
242 |     "    '/mnt/e/data/scghost_pfc_output/working_results/',\n",
243 |     "    '/mnt/e/data/scghost_pfc_output/chrom_indices.pkl',\n",
244 |     "    '/mnt/e/data/pfc/scAB.hdf5'\n",
245 |     ")"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": 1,
251 |    "metadata": {},
252 |    "outputs": [
253 |     {
254 |      "name": "stderr",
255 |      "output_type": "stream",
256 |      "text": [
257 |       "/home/kyle/anaconda3/envs/dr/lib/python3.10/site-packages/umap/distances.py:1063: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n",
258 |       "  @numba.jit()\n",
259 |       "/home/kyle/anaconda3/envs/dr/lib/python3.10/site-packages/umap/distances.py:1071: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n",
260 |       "  @numba.jit()\n",
261 |       "/home/kyle/anaconda3/envs/dr/lib/python3.10/site-packages/umap/distances.py:1086: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n",
262 |       "  @numba.jit()\n",
263 |       "/home/kyle/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
264 |       "  from .autonotebook import tqdm as notebook_tqdm\n",
265 |       "/home/kyle/anaconda3/envs/dr/lib/python3.10/site-packages/umap/umap_.py:660: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n",
266 |       "  @numba.jit()\n"
267 |      ]
268 |     },
269 |     {
270 |      "name": "stdout",
271 |      "output_type": "stream",
272 |      "text": [
273 |       "['L2/3' 'L2/3' 'L2/3' ... 'L4' 'Astro' 'ODC']\n",
274 |       "<KeysViewHDF5 ['compartment']>\n"
275 |      ]
276 |     },
277 |     {
278 |      "name": "stderr",
279 |      "output_type": "stream",
280 |      "text": [
281 |       "100%|██████████| 4238/4238 [00:04<00:00, 984.72it/s] \n"
282 |      ]
283 |     },
284 |     {
285 |      "name": "stdout",
286 |      "output_type": "stream",
287 |      "text": [
288 |       "(4238, 5432)\n",
289 |       "(4238, 5432)\n"
290 |      ]
291 |     }
292 |    ],
293 |    "source": [
294 |     "from modules.analysis import sc_compartment2embedding\n",
295 |     "import seaborn as sns\n",
296 |     "import matplotlib.pyplot as plt\n",
297 |     "\n",
298 |     "(vec,label,pal) = sc_compartment2embedding('./tutorial_embeds.hdf5','/mnt/e/data/pfc/','tutorial_scatter.pdf')\n",
299 |     "\n",
300 |     "fig, ax = plt.subplots(figsize=(7, 5))\n",
301 |     "sns.scatterplot(x=vec[:, 0], y=vec[:, 1], hue=label, linewidth=0, s=2, alpha=1.0, palette=pal)\n",
302 |     "\n",
303 |     "handles, labels = ax.get_legend_handles_labels()\n",
304 |     "labels, handles = zip(*sorted(zip(labels, handles), key=lambda t: t[0]))\n",
305 |     "ax.legend(handles=handles, labels=labels, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)"
306 |    ]
307 |   }
308 |  ],
309 |  "metadata": {
310 |   "kernelspec": {
311 |    "display_name": "dr",
312 |    "language": "python",
313 |    "name": "python3"
314 |   },
315 |   "language_info": {
316 |    "codemirror_mode": {
317 |     "name": "ipython",
318 |     "version": 3
319 |    },
320 |    "file_extension": ".py",
321 |    "mimetype": "text/x-python",
322 |    "name": "python",
323 |    "nbconvert_exporter": "python",
324 |    "pygments_lexer": "ipython3",
325 |    "version": "3.10.11"
326 |   }
327 |  },
328 |  "nbformat": 4,
329 |  "nbformat_minor": 2
330 | }
331 | 


--------------------------------------------------------------------------------
/data/hg38.chrom.sizes:
--------------------------------------------------------------------------------
  1 | chr1	248956422
  2 | chr2	242193529
  3 | chr3	198295559
  4 | chr4	190214555
  5 | chr5	181538259
  6 | chr6	170805979
  7 | chr7	159345973
  8 | chr8	145138636
  9 | chr9	138394717
 10 | chr10	133797422
 11 | chr11	135086622
 12 | chr12	133275309
 13 | chr13	114364328
 14 | chr14	107043718
 15 | chr15	101991189
 16 | chr16	90338345
 17 | chr17	83257441
 18 | chr18	80373285
 19 | chr19	58617616
 20 | chr20	64444167
 21 | chr21	46709983
 22 | chr22	50818468
 23 | chrX	156040895
 24 | chrY	57227415
 25 | chrM	16569
 26 | chr11_KI270721v1_random	100316
 27 | chr14_GL000009v2_random	201709
 28 | chr14_GL000225v1_random	211173
 29 | chr14_KI270722v1_random	194050
 30 | chr14_GL000194v1_random	191469
 31 | chr14_KI270723v1_random	38115
 32 | chr14_KI270724v1_random	39555
 33 | chr14_KI270725v1_random	172810
 34 | chr14_KI270726v1_random	43739
 35 | chr15_KI270727v1_random	448248
 36 | chr16_KI270728v1_random	1872759
 37 | chr17_GL000205v2_random	185591
 38 | chr17_KI270729v1_random	280839
 39 | chr17_KI270730v1_random	112551
 40 | chr1_KI270706v1_random	175055
 41 | chr1_KI270707v1_random	32032
 42 | chr1_KI270708v1_random	127682
 43 | chr1_KI270709v1_random	66860
 44 | chr1_KI270710v1_random	40176
 45 | chr1_KI270711v1_random	42210
 46 | chr1_KI270712v1_random	176043
 47 | chr1_KI270713v1_random	40745
 48 | chr1_KI270714v1_random	41717
 49 | chr22_KI270731v1_random	150754
 50 | chr22_KI270732v1_random	41543
 51 | chr22_KI270733v1_random	179772
 52 | chr22_KI270734v1_random	165050
 53 | chr22_KI270735v1_random	42811
 54 | chr22_KI270736v1_random	181920
 55 | chr22_KI270737v1_random	103838
 56 | chr22_KI270738v1_random	99375
 57 | chr22_KI270739v1_random	73985
 58 | chr2_KI270715v1_random	161471
 59 | chr2_KI270716v1_random	153799
 60 | chr3_GL000221v1_random	155397
 61 | chr4_GL000008v2_random	209709
 62 | chr5_GL000208v1_random	92689
 63 | chr9_KI270717v1_random	40062
 64 | chr9_KI270718v1_random	38054
 65 | chr9_KI270719v1_random	176845
 66 | chr9_KI270720v1_random	39050
 67 | chr1_KI270762v1_alt	354444
 68 | chr1_KI270766v1_alt	256271
 69 | chr1_KI270760v1_alt	109528
 70 | chr1_KI270765v1_alt	185285
 71 | chr1_GL383518v1_alt	182439
 72 | chr1_GL383519v1_alt	110268
 73 | chr1_GL383520v2_alt	366580
 74 | chr1_KI270764v1_alt	50258
 75 | chr1_KI270763v1_alt	911658
 76 | chr1_KI270759v1_alt	425601
 77 | chr1_KI270761v1_alt	165834
 78 | chr2_KI270770v1_alt	136240
 79 | chr2_KI270773v1_alt	70887
 80 | chr2_KI270774v1_alt	223625
 81 | chr2_KI270769v1_alt	120616
 82 | chr2_GL383521v1_alt	143390
 83 | chr2_KI270772v1_alt	133041
 84 | chr2_KI270775v1_alt	138019
 85 | chr2_KI270771v1_alt	110395
 86 | chr2_KI270768v1_alt	110099
 87 | chr2_GL582966v2_alt	96131
 88 | chr2_GL383522v1_alt	123821
 89 | chr2_KI270776v1_alt	174166
 90 | chr2_KI270767v1_alt	161578
 91 | chr3_JH636055v2_alt	173151
 92 | chr3_KI270783v1_alt	109187
 93 | chr3_KI270780v1_alt	224108
 94 | chr3_GL383526v1_alt	180671
 95 | chr3_KI270777v1_alt	173649
 96 | chr3_KI270778v1_alt	248252
 97 | chr3_KI270781v1_alt	113034
 98 | chr3_KI270779v1_alt	205312
 99 | chr3_KI270782v1_alt	162429
100 | chr3_KI270784v1_alt	184404
101 | chr4_KI270790v1_alt	220246
102 | chr4_GL383528v1_alt	376187
103 | chr4_KI270787v1_alt	111943
104 | chr4_GL000257v2_alt	586476
105 | chr4_KI270788v1_alt	158965
106 | chr4_GL383527v1_alt	164536
107 | chr4_KI270785v1_alt	119912
108 | chr4_KI270789v1_alt	205944
109 | chr4_KI270786v1_alt	244096
110 | chr5_KI270793v1_alt	126136
111 | chr5_KI270792v1_alt	179043
112 | chr5_KI270791v1_alt	195710
113 | chr5_GL383532v1_alt	82728
114 | chr5_GL949742v1_alt	226852
115 | chr5_KI270794v1_alt	164558
116 | chr5_GL339449v2_alt	1612928
117 | chr5_GL383530v1_alt	101241
118 | chr5_KI270796v1_alt	172708
119 | chr5_GL383531v1_alt	173459
120 | chr5_KI270795v1_alt	131892
121 | chr6_GL000250v2_alt	4672374
122 | chr6_KI270800v1_alt	175808
123 | chr6_KI270799v1_alt	152148
124 | chr6_GL383533v1_alt	124736
125 | chr6_KI270801v1_alt	870480
126 | chr6_KI270802v1_alt	75005
127 | chr6_KB021644v2_alt	185823
128 | chr6_KI270797v1_alt	197536
129 | chr6_KI270798v1_alt	271782
130 | chr7_KI270804v1_alt	157952
131 | chr7_KI270809v1_alt	209586
132 | chr7_KI270806v1_alt	158166
133 | chr7_GL383534v2_alt	119183
134 | chr7_KI270803v1_alt	1111570
135 | chr7_KI270808v1_alt	271455
136 | chr7_KI270807v1_alt	126434
137 | chr7_KI270805v1_alt	209988
138 | chr8_KI270818v1_alt	145606
139 | chr8_KI270812v1_alt	282736
140 | chr8_KI270811v1_alt	292436
141 | chr8_KI270821v1_alt	985506
142 | chr8_KI270813v1_alt	300230
143 | chr8_KI270822v1_alt	624492
144 | chr8_KI270814v1_alt	141812
145 | chr8_KI270810v1_alt	374415
146 | chr8_KI270819v1_alt	133535
147 | chr8_KI270820v1_alt	36640
148 | chr8_KI270817v1_alt	158983
149 | chr8_KI270816v1_alt	305841
150 | chr8_KI270815v1_alt	132244
151 | chr9_GL383539v1_alt	162988
152 | chr9_GL383540v1_alt	71551
153 | chr9_GL383541v1_alt	171286
154 | chr9_GL383542v1_alt	60032
155 | chr9_KI270823v1_alt	439082
156 | chr10_GL383545v1_alt	179254
157 | chr10_KI270824v1_alt	181496
158 | chr10_GL383546v1_alt	309802
159 | chr10_KI270825v1_alt	188315
160 | chr11_KI270832v1_alt	210133
161 | chr11_KI270830v1_alt	177092
162 | chr11_KI270831v1_alt	296895
163 | chr11_KI270829v1_alt	204059
164 | chr11_GL383547v1_alt	154407
165 | chr11_JH159136v1_alt	200998
166 | chr11_JH159137v1_alt	191409
167 | chr11_KI270827v1_alt	67707
168 | chr11_KI270826v1_alt	186169
169 | chr12_GL877875v1_alt	167313
170 | chr12_GL877876v1_alt	408271
171 | chr12_KI270837v1_alt	40090
172 | chr12_GL383549v1_alt	120804
173 | chr12_KI270835v1_alt	238139
174 | chr12_GL383550v2_alt	169178
175 | chr12_GL383552v1_alt	138655
176 | chr12_GL383553v2_alt	152874
177 | chr12_KI270834v1_alt	119498
178 | chr12_GL383551v1_alt	184319
179 | chr12_KI270833v1_alt	76061
180 | chr12_KI270836v1_alt	56134
181 | chr13_KI270840v1_alt	191684
182 | chr13_KI270839v1_alt	180306
183 | chr13_KI270843v1_alt	103832
184 | chr13_KI270841v1_alt	169134
185 | chr13_KI270838v1_alt	306913
186 | chr13_KI270842v1_alt	37287
187 | chr14_KI270844v1_alt	322166
188 | chr14_KI270847v1_alt	1511111
189 | chr14_KI270845v1_alt	180703
190 | chr14_KI270846v1_alt	1351393
191 | chr15_KI270852v1_alt	478999
192 | chr15_KI270851v1_alt	263054
193 | chr15_KI270848v1_alt	327382
194 | chr15_GL383554v1_alt	296527
195 | chr15_KI270849v1_alt	244917
196 | chr15_GL383555v2_alt	388773
197 | chr15_KI270850v1_alt	430880
198 | chr16_KI270854v1_alt	134193
199 | chr16_KI270856v1_alt	63982
200 | chr16_KI270855v1_alt	232857
201 | chr16_KI270853v1_alt	2659700
202 | chr16_GL383556v1_alt	192462
203 | chr16_GL383557v1_alt	89672
204 | chr17_GL383563v3_alt	375691
205 | chr17_KI270862v1_alt	391357
206 | chr17_KI270861v1_alt	196688
207 | chr17_KI270857v1_alt	2877074
208 | chr17_JH159146v1_alt	278131
209 | chr17_JH159147v1_alt	70345
210 | chr17_GL383564v2_alt	133151
211 | chr17_GL000258v2_alt	1821992
212 | chr17_GL383565v1_alt	223995
213 | chr17_KI270858v1_alt	235827
214 | chr17_KI270859v1_alt	108763
215 | chr17_GL383566v1_alt	90219
216 | chr17_KI270860v1_alt	178921
217 | chr18_KI270864v1_alt	111737
218 | chr18_GL383567v1_alt	289831
219 | chr18_GL383570v1_alt	164789
220 | chr18_GL383571v1_alt	198278
221 | chr18_GL383568v1_alt	104552
222 | chr18_GL383569v1_alt	167950
223 | chr18_GL383572v1_alt	159547
224 | chr18_KI270863v1_alt	167999
225 | chr19_KI270868v1_alt	61734
226 | chr19_KI270865v1_alt	52969
227 | chr19_GL383573v1_alt	385657
228 | chr19_GL383575v2_alt	170222
229 | chr19_GL383576v1_alt	188024
230 | chr19_GL383574v1_alt	155864
231 | chr19_KI270866v1_alt	43156
232 | chr19_KI270867v1_alt	233762
233 | chr19_GL949746v1_alt	987716
234 | chr20_GL383577v2_alt	128386
235 | chr20_KI270869v1_alt	118774
236 | chr20_KI270871v1_alt	58661
237 | chr20_KI270870v1_alt	183433
238 | chr21_GL383578v2_alt	63917
239 | chr21_KI270874v1_alt	166743
240 | chr21_KI270873v1_alt	143900
241 | chr21_GL383579v2_alt	201197
242 | chr21_GL383580v2_alt	74653
243 | chr21_GL383581v2_alt	116689
244 | chr21_KI270872v1_alt	82692
245 | chr22_KI270875v1_alt	259914
246 | chr22_KI270878v1_alt	186262
247 | chr22_KI270879v1_alt	304135
248 | chr22_KI270876v1_alt	263666
249 | chr22_KI270877v1_alt	101331
250 | chr22_GL383583v2_alt	96924
251 | chr22_GL383582v2_alt	162811
252 | chrX_KI270880v1_alt	284869
253 | chrX_KI270881v1_alt	144206
254 | chr19_KI270882v1_alt	248807
255 | chr19_KI270883v1_alt	170399
256 | chr19_KI270884v1_alt	157053
257 | chr19_KI270885v1_alt	171027
258 | chr19_KI270886v1_alt	204239
259 | chr19_KI270887v1_alt	209512
260 | chr19_KI270888v1_alt	155532
261 | chr19_KI270889v1_alt	170698
262 | chr19_KI270890v1_alt	184499
263 | chr19_KI270891v1_alt	170680
264 | chr1_KI270892v1_alt	162212
265 | chr2_KI270894v1_alt	214158
266 | chr2_KI270893v1_alt	161218
267 | chr3_KI270895v1_alt	162896
268 | chr4_KI270896v1_alt	378547
269 | chr5_KI270897v1_alt	1144418
270 | chr5_KI270898v1_alt	130957
271 | chr6_GL000251v2_alt	4795265
272 | chr7_KI270899v1_alt	190869
273 | chr8_KI270901v1_alt	136959
274 | chr8_KI270900v1_alt	318687
275 | chr11_KI270902v1_alt	106711
276 | chr11_KI270903v1_alt	214625
277 | chr12_KI270904v1_alt	572349
278 | chr15_KI270906v1_alt	196384
279 | chr15_KI270905v1_alt	5161414
280 | chr17_KI270907v1_alt	137721
281 | chr17_KI270910v1_alt	157099
282 | chr17_KI270909v1_alt	325800
283 | chr17_JH159148v1_alt	88070
284 | chr17_KI270908v1_alt	1423190
285 | chr18_KI270912v1_alt	174061
286 | chr18_KI270911v1_alt	157710
287 | chr19_GL949747v2_alt	729520
288 | chr22_KB663609v1_alt	74013
289 | chrX_KI270913v1_alt	274009
290 | chr19_KI270914v1_alt	205194
291 | chr19_KI270915v1_alt	170665
292 | chr19_KI270916v1_alt	184516
293 | chr19_KI270917v1_alt	190932
294 | chr19_KI270918v1_alt	123111
295 | chr19_KI270919v1_alt	170701
296 | chr19_KI270920v1_alt	198005
297 | chr19_KI270921v1_alt	282224
298 | chr19_KI270922v1_alt	187935
299 | chr19_KI270923v1_alt	189352
300 | chr3_KI270924v1_alt	166540
301 | chr4_KI270925v1_alt	555799
302 | chr6_GL000252v2_alt	4604811
303 | chr8_KI270926v1_alt	229282
304 | chr11_KI270927v1_alt	218612
305 | chr19_GL949748v2_alt	1064304
306 | chr22_KI270928v1_alt	176103
307 | chr19_KI270929v1_alt	186203
308 | chr19_KI270930v1_alt	200773
309 | chr19_KI270931v1_alt	170148
310 | chr19_KI270932v1_alt	215732
311 | chr19_KI270933v1_alt	170537
312 | chr19_GL000209v2_alt	177381
313 | chr3_KI270934v1_alt	163458
314 | chr6_GL000253v2_alt	4677643
315 | chr19_GL949749v2_alt	1091841
316 | chr3_KI270935v1_alt	197351
317 | chr6_GL000254v2_alt	4827813
318 | chr19_GL949750v2_alt	1066390
319 | chr3_KI270936v1_alt	164170
320 | chr6_GL000255v2_alt	4606388
321 | chr19_GL949751v2_alt	1002683
322 | chr3_KI270937v1_alt	165607
323 | chr6_GL000256v2_alt	4929269
324 | chr19_GL949752v1_alt	987100
325 | chr6_KI270758v1_alt	76752
326 | chr19_GL949753v2_alt	796479
327 | chr19_KI270938v1_alt	1066800
328 | chrUn_KI270302v1	2274
329 | chrUn_KI270304v1	2165
330 | chrUn_KI270303v1	1942
331 | chrUn_KI270305v1	1472
332 | chrUn_KI270322v1	21476
333 | chrUn_KI270320v1	4416
334 | chrUn_KI270310v1	1201
335 | chrUn_KI270316v1	1444
336 | chrUn_KI270315v1	2276
337 | chrUn_KI270312v1	998
338 | chrUn_KI270311v1	12399
339 | chrUn_KI270317v1	37690
340 | chrUn_KI270412v1	1179
341 | chrUn_KI270411v1	2646
342 | chrUn_KI270414v1	2489
343 | chrUn_KI270419v1	1029
344 | chrUn_KI270418v1	2145
345 | chrUn_KI270420v1	2321
346 | chrUn_KI270424v1	2140
347 | chrUn_KI270417v1	2043
348 | chrUn_KI270422v1	1445
349 | chrUn_KI270423v1	981
350 | chrUn_KI270425v1	1884
351 | chrUn_KI270429v1	1361
352 | chrUn_KI270442v1	392061
353 | chrUn_KI270466v1	1233
354 | chrUn_KI270465v1	1774
355 | chrUn_KI270467v1	3920
356 | chrUn_KI270435v1	92983
357 | chrUn_KI270438v1	112505
358 | chrUn_KI270468v1	4055
359 | chrUn_KI270510v1	2415
360 | chrUn_KI270509v1	2318
361 | chrUn_KI270518v1	2186
362 | chrUn_KI270508v1	1951
363 | chrUn_KI270516v1	1300
364 | chrUn_KI270512v1	22689
365 | chrUn_KI270519v1	138126
366 | chrUn_KI270522v1	5674
367 | chrUn_KI270511v1	8127
368 | chrUn_KI270515v1	6361
369 | chrUn_KI270507v1	5353
370 | chrUn_KI270517v1	3253
371 | chrUn_KI270529v1	1899
372 | chrUn_KI270528v1	2983
373 | chrUn_KI270530v1	2168
374 | chrUn_KI270539v1	993
375 | chrUn_KI270538v1	91309
376 | chrUn_KI270544v1	1202
377 | chrUn_KI270548v1	1599
378 | chrUn_KI270583v1	1400
379 | chrUn_KI270587v1	2969
380 | chrUn_KI270580v1	1553
381 | chrUn_KI270581v1	7046
382 | chrUn_KI270579v1	31033
383 | chrUn_KI270589v1	44474
384 | chrUn_KI270590v1	4685
385 | chrUn_KI270584v1	4513
386 | chrUn_KI270582v1	6504
387 | chrUn_KI270588v1	6158
388 | chrUn_KI270593v1	3041
389 | chrUn_KI270591v1	5796
390 | chrUn_KI270330v1	1652
391 | chrUn_KI270329v1	1040
392 | chrUn_KI270334v1	1368
393 | chrUn_KI270333v1	2699
394 | chrUn_KI270335v1	1048
395 | chrUn_KI270338v1	1428
396 | chrUn_KI270340v1	1428
397 | chrUn_KI270336v1	1026
398 | chrUn_KI270337v1	1121
399 | chrUn_KI270363v1	1803
400 | chrUn_KI270364v1	2855
401 | chrUn_KI270362v1	3530
402 | chrUn_KI270366v1	8320
403 | chrUn_KI270378v1	1048
404 | chrUn_KI270379v1	1045
405 | chrUn_KI270389v1	1298
406 | chrUn_KI270390v1	2387
407 | chrUn_KI270387v1	1537
408 | chrUn_KI270395v1	1143
409 | chrUn_KI270396v1	1880
410 | chrUn_KI270388v1	1216
411 | chrUn_KI270394v1	970
412 | chrUn_KI270386v1	1788
413 | chrUn_KI270391v1	1484
414 | chrUn_KI270383v1	1750
415 | chrUn_KI270393v1	1308
416 | chrUn_KI270384v1	1658
417 | chrUn_KI270392v1	971
418 | chrUn_KI270381v1	1930
419 | chrUn_KI270385v1	990
420 | chrUn_KI270382v1	4215
421 | chrUn_KI270376v1	1136
422 | chrUn_KI270374v1	2656
423 | chrUn_KI270372v1	1650
424 | chrUn_KI270373v1	1451
425 | chrUn_KI270375v1	2378
426 | chrUn_KI270371v1	2805
427 | chrUn_KI270448v1	7992
428 | chrUn_KI270521v1	7642
429 | chrUn_GL000195v1	182896
430 | chrUn_GL000219v1	179198
431 | chrUn_GL000220v1	161802
432 | chrUn_GL000224v1	179693
433 | chrUn_KI270741v1	157432
434 | chrUn_GL000226v1	15008
435 | chrUn_GL000213v1	164239
436 | chrUn_KI270743v1	210658
437 | chrUn_KI270744v1	168472
438 | chrUn_KI270745v1	41891
439 | chrUn_KI270746v1	66486
440 | chrUn_KI270747v1	198735
441 | chrUn_KI270748v1	93321
442 | chrUn_KI270749v1	158759
443 | chrUn_KI270750v1	148850
444 | chrUn_KI270751v1	150742
445 | chrUn_KI270752v1	27745
446 | chrUn_KI270753v1	62944
447 | chrUn_KI270754v1	40191
448 | chrUn_KI270755v1	36723
449 | chrUn_KI270756v1	79590
450 | chrUn_KI270757v1	71251
451 | chrUn_GL000214v1	137718
452 | chrUn_KI270742v1	186739
453 | chrUn_GL000216v2	176608
454 | chrUn_GL000218v1	161147
455 | chrY_KI270740v1_random	37240
456 | 


--------------------------------------------------------------------------------