├── .gitignore
├── LICENSE
├── README.md
├── pyproject.toml
└── qtl
    ├── __init__.py
    ├── annotation.py
    ├── coloc.py
    ├── core.py
    ├── genotype.py
    ├── gtex.py
    ├── io.py
    ├── locusplot.py
    ├── map.py
    ├── norm.py
    ├── pca.py
    ├── pileup.py
    ├── plot.py
    ├── sam.py
    ├── stats.py
    └── torus.py

/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
*.egg-info/
*.ipynb_checkpoints/
.DS_Store
__*__
build/
dist/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2019, The Broad Institute, Inc. and The General Hospital Corporation.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## pyQTL

pyQTL is a Python module for analyzing and visualizing quantitative trait loci (QTL) data.

The following functionalities are provided:
* `qtl.annotation`: class for working with gene annotations; includes a [GTF](https://www.gencodegenes.org/pages/data_format.html) parser.
* `qtl.coloc`: Python implementation of core functions from the [R COLOC package](https://github.com/chr1swallace/coloc).
* `qtl.io`: functions for reading/writing BED and GCT files.
* `qtl.locusplot`: functions for generating LocusZoom-style regional association plots.
* `qtl.pileup`: functions for visualizing QTL effects in read pileups from, e.g., RNA-seq data.
* `qtl.plot`: plotting functions for QTLs.

### Install
You can install pyQTL using pip:
```
pip3 install qtl
```
or directly from this repository:
```
$ git clone git@github.com:broadinstitute/pyqtl.git
$ cd pyqtl
# set up virtual environment and install
$ virtualenv venv
$ source venv/bin/activate
(venv)$ pip install -e .
```
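### Examples
A minimal sketch of two common operations (file paths below are placeholders):
```
import qtl.io

# load an expression matrix in GCT format
df = qtl.io.read_gct('expression.gct.gz', load_description=False)

# BED-formatted transcription start sites from a GENCODE GTF
tss_df = qtl.io.gtf_to_tss_bed('gencode.v26.annotation.gtf.gz')
```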
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "qtl"
version = "0.1.10"
dependencies = [
    "biopython",
    "bx-python",
    "pyBigWig",
    "matplotlib",
    "numpy",
    "pandas",
    "scipy",
    "seaborn",
]
authors = [
    {name = "Francois Aguet", email = "francois@broadinstitute.org"}
]
maintainers = [
    {name = "Francois Aguet", email = "francois@broadinstitute.org"}
]
description = "Utilities for analyzing and visualizing QTL data"
readme = "README.md"
license = {file = "LICENSE"}
keywords = ["Quantitative trait loci"]
classifiers = [
    "Development Status :: 4 - Beta",
    "Programming Language :: Python :: 3",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Bio-Informatics",
]

[project.urls]
Repository = "https://github.com/broadinstitute/pyqtl.git"

--------------------------------------------------------------------------------
/qtl/__init__.py:
--------------------------------------------------------------------------------
import importlib.metadata
__version__ = importlib.metadata.version(__name__)
from .core import *

--------------------------------------------------------------------------------
/qtl/coloc.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import scipy.stats as stats
from statsmodels.formula.api import ols
import itertools

# Code adapted from
# https://github.com/chr1swallace/coloc/blob/master/R/claudia.R


def var_data(f, N):
    """
    Variance of MLE of beta for quantitative trait, assuming var(y) = 1

    Args:
        f: minor allele freq
        N: sample size

    Returns:
        variance of MLE beta
    """
    return 1 / (2 * N * f * (1 - f))


def var_data_cc(f, N, s):
    """
    Variance of MLE of beta for case-control

    Args:
        f: minor allele freq
        N: sample size
        s: proportion of samples that are cases

    Returns:
        variance of MLE beta
    """
    return 1 / (2 * N * f * (1 - f) * s * (1 - s))


def logsum(x):
    """Computes log(sum(ABF)), where x = log(ABF)"""
    mmax = np.max(x)
    return mmax + np.log(np.sum(np.exp(x-mmax)))


def logdiff(x, y):
    """Computes log(exp(x) - exp(y)) in a numerically stable way"""
    mmax = np.maximum(np.max(x), np.max(y))
    return mmax + np.log(np.exp(x - mmax) - np.exp(y - mmax))
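
# Note: logsum/logdiff operate on the log scale to avoid overflow. As a quick
# illustration (not part of the module): for x = np.array([1000., 1000.]),
# np.log(np.sum(np.exp(x))) overflows, whereas logsum(x) returns
# 1000 + np.log(2) ≈ 1000.693.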

def approx_bf_p(p, f, N, s=None, type='quant'):
    """
    Calculate approximate Bayes Factors

    Args:
        p: p-value
        f: minor allele frequency
        N: sample size
        s: proportion of samples that are cases
        type: 'quant' or 'cc'

    Returns:
        Data frame with lABF and intermediate calculations
    """
    if type == 'quant':
        sd_prior = 0.15
        v = var_data(f, N)
    else:
        sd_prior = 0.2
        v = var_data_cc(f, N, s)
    z = stats.norm.isf(0.5 * p)
    # shrinkage factor: ratio of the prior variance to the total variance
    r = sd_prior**2 / (sd_prior**2 + v)
    # approximate BF, on the natural-log scale (comparable to log likelihood-ratio differences)
    labf = 0.5 * (np.log(1 - r) + r*z*z)
    return pd.DataFrame({'v':v, 'z':z, 'r':r, 'lABF':labf})


def approx_bf_estimates(z, v, type='quant', sdy=1):
    """
    Calculates approximate Bayes Factors using the variance of the regression coefficients

    See eq. (2) in Wakefield, 2009 and Supplementary methods from Giambartolomei et al., 2014.

    Args:
        z: normal deviate associated with regression coefficient and its variance (in effect the t-statistic, beta/beta_se)
        v: variance of the regression coefficient (beta_se**2)
        type: 'quant' or 'cc'
        sdy: standard deviation of the trait

    Returns:
        Data frame with lABF and intermediate calculations
    """
    if type == 'quant':
        sd_prior = 0.15*sdy
    else:
        sd_prior = 0.2
    r = sd_prior**2 / (sd_prior**2 + v)
    labf = 0.5 * (np.log(1 - r) + r*z*z)
    return pd.DataFrame({'v':v, 'z':z, 'r':r, 'lABF':labf})


def combine_abf(l1, l2, p1=1e-4, p2=1e-4, p12=1e-5, verbose=False):
    """
    Calculate posterior probabilities for configurations, given logABFs for each SNP and prior probabilities

    Args:
        l1: logABFs for trait 1
        l2: logABFs for trait 2
        p1: prior probability a SNP is associated with trait 1, default 1e-4
        p2: prior probability a SNP is associated with trait 2, default 1e-4
        p12: prior probability a SNP is associated with both traits, default 1e-5
        verbose: print the posterior probabilities

    Returns:
        pd.Series of posterior probabilities
    """
    lsum = l1 + l2
    lh0_abf = 0
    lh1_abf = np.log(p1) + logsum(l1)
    lh2_abf = np.log(p2) + logsum(l2)
    lh3_abf = np.log(p1) + np.log(p2) + logdiff(logsum(l1) + logsum(l2), logsum(lsum))
    lh4_abf = np.log(p12) + logsum(lsum)
    all_abf = [lh0_abf, lh1_abf, lh2_abf, lh3_abf, lh4_abf]
    my_denom_log_abf = logsum(all_abf)  # denominator in eq. 2
    pp_abf = np.exp(all_abf - my_denom_log_abf)
    pp_abf = pd.Series(pp_abf, index=[f'pp_h{i}_abf' for i in range(5)])
    if verbose:
        print(pp_abf)
        print(f"PP abf for shared variant: {pp_abf['pp_h4_abf']*100:.3f}%")
    return pp_abf
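
# Example (sketch): per-variant log ABFs for two traits via Wakefield's
# approximation, then posterior probabilities for the five hypotheses
# (H0: no association, ..., H4: shared causal variant). Assumes standardized
# traits (sdy=1) and DataFrames df1/df2 with 'beta' and 'beta_se' columns:
#   labf1 = approx_bf_estimates(df1['beta']/df1['beta_se'], df1['beta_se']**2)['lABF']
#   labf2 = approx_bf_estimates(df2['beta']/df2['beta_se'], df2['beta_se']**2)['lABF']
#   pp = combine_abf(labf1, labf2)   # pd.Series: pp_h0_abf ... pp_h4_abf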

def process_dataset(df, N=None, sdy=None, type='quant'):
    """
    Preprocessing steps, including calculation of approximate Bayes Factors

    Args:
        df: data frame with columns: beta, beta_se (and maf, if sdy is not provided) -or- pval, maf
        N: sample size
        sdy: standard deviation of the trait. Estimated from the beta_se and MAF if not provided.

    Returns a data frame with the additional columns:
      v (beta_se**2), z (z-score), r (shrinkage factor), lABF (log ABF)
    """
    if 'beta' in df and 'beta_se' in df:
        beta_var = df['beta_se']**2
        if sdy is None:
            print('WARNING: estimating sdy from the data')
            sdy = sdy_est(beta_var, df['maf'], N)
        res_df = approx_bf_estimates(df['beta']/df['beta_se'], beta_var, type=type, sdy=sdy)
    else:
        pval_col = df.columns[df.columns.str.startswith('pval')][0]
        res_df = approx_bf_p(df[pval_col], df['maf'], type=type, N=N, s=None)
    return df.join(res_df)


def sdy_est(vbeta, maf, n):
    """
    Estimate trait standard deviation given vectors of variance of coefficients, MAF and sample size

    Estimate is based on var(beta-hat) = var(Y) / (n * var(X))
    var(X) = 2*maf*(1-maf)
    so we can estimate var(Y) by regressing n*var(X) against 1/var(beta)

    Args:
        vbeta: vector of variance of coefficients
        maf: vector of MAF (same length as vbeta)
        n: sample size

    Returns:
        estimated standard deviation of Y
    """
    print('Warning: estimating sdY from MAF and varbeta, provide this if known.')
    oneover = 1/vbeta
    nvx = 2 * n * maf * (1-maf)  # n * var(X)
    res = ols('nvx ~ oneover - 1', {'nvx':nvx, 'oneover':oneover}).fit()
    cf = res.params['oneover']  # the regression coefficient estimates var(Y)
    return np.sqrt(cf)


def abf(df1, df2, N=None, sdy=None, p1=1e-4, p2=1e-4, p12=1e-5, verbose=False):
    """
    Colocalization analysis using approximate Bayes Factors (cf. coloc.abf in the R package)

    Args:
        df1, df2: DataFrames with columns
                  'beta' and 'beta_se' -or-
                  'pval_nominal' and 'maf'
        N: sample size, must be provided if using p-values and MAF

    Returns:
        pd.Series of posterior probabilities and the merged per-variant DataFrame
    """
    if 'sample_size' in df1:
        n1 = int(df1['sample_size'].values[0])
    else:
        assert N is not None
        n1 = N

    if 'sample_size' in df2:
        n2 = int(df2['sample_size'].values[0])
    else:
        assert N is not None
        n2 = N

    if 'p_std' in df1:
        sdy1 = float(df1['p_std'].values[0])
    else:
        sdy1 = sdy
    if 'p_std' in df2:
        sdy2 = float(df2['p_std'].values[0])
    else:
        sdy2 = sdy
    mdf1 = process_dataset(df1, N=n1, sdy=sdy1)
    mdf2 = process_dataset(df2, N=n2, sdy=sdy2)

    # note: merging on the positional index assumes df1 and df2 contain the same variants in the same order
    merged_df = pd.merge(mdf1.reset_index(drop=True), mdf2.reset_index(drop=True), suffixes=('_1', '_2'),
                         left_index=True, right_index=True)
    # merged_df = merged_df.sort_values('snp_1')
    internal_sum_lABF = merged_df['lABF_1'] + merged_df['lABF_2']
    merged_df['internal_sum_lABF'] = internal_sum_lABF
    my_denom_log_abf = logsum(internal_sum_lABF)
    merged_df['snp_pp_h4'] = np.exp(internal_sum_lABF - my_denom_log_abf)
    pp_abf = combine_abf(mdf1['lABF'], mdf2['lABF'], p1=p1, p2=p2, p12=p12, verbose=verbose)
    return pp_abf, merged_df
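
# Example (sketch): colocalize eQTL and GWAS summary statistics for one locus,
# given DataFrames df_eqtl and df_gwas with the same variants in the same order:
#   pp, merged_df = abf(df_eqtl, df_gwas, N=500)
#   pp['pp_h4_abf']   # posterior probability of a shared causal variant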

def susie(s1, s2, p1=1e-4, p2=1e-4, p12=5e-6, verbose=False, is_sorted=True):
    """
    Colocalisation with multiple causal variants using SuSiE

    s1, s2: outputs from SuSiE

    Note: this function assumes that 'lbf_variable' are indexed by 'cs_index':
      res['lbf_variable'] = res['lbf_variable'][res['sets']['cs_index']]
    See tensorqtl.susie.map() for additional details.
    """
    cs1 = s1['sets']
    cs2 = s2['sets']
    lbf1 = s1['lbf_variable']
    lbf2 = s2['lbf_variable']
    if not isinstance(lbf1, pd.DataFrame):
        lbf1 = pd.DataFrame(lbf1, columns=s1['pip'].index)
    if not isinstance(lbf2, pd.DataFrame):
        lbf2 = pd.DataFrame(lbf2, columns=s2['pip'].index)
    isnps = lbf1.columns[lbf1.columns.isin(lbf2.columns)]
    n = len(isnps)
    if cs1['cs'] is None or cs2['cs'] is None or len(cs1['cs']) == 0 or len(cs2['cs']) == 0 or n == 0:
        return None
    if verbose:
        print(f"Using {n} shared variants (of {lbf1.shape[1]} and {lbf2.shape[1]})")
    idx1 = cs1['cs_index']
    idx2 = cs2['cs_index']
    if not is_sorted:
        bf1 = lbf1.loc[idx1, isnps]
        bf2 = lbf2.loc[idx2, isnps]
    else:
        bf1 = lbf1[isnps]
        bf2 = lbf2[isnps]

    ret = bf_bf(bf1, bf2, p1=p1, p2=p2, p12=p12)
    if ret is None:  # bf_bf returns None if the variant overlap is insufficient
        return None

    ret['summary']['idx1'] = idx1[ret['summary']['idx1']]
    ret['summary']['idx2'] = idx2[ret['summary']['idx2']]
    # ret$summary[, `:=`(idx1, cs1$cs_index[idx1])]
    # ret$summary[, `:=`(idx2, cs2$cs_index[idx2])]
    return ret
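
# Example (sketch): colocalization from two SuSiE results s1, s2 (dict-like,
# with 'sets', 'lbf_variable', and 'pip' entries as described in the docstring):
#   res = susie(s1, s2, p12=5e-6)
#   if res is not None:
#       res['summary'][['hit1', 'hit2', 'pp_h4_abf']]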

def bf_bf(bf1, bf2, p1=1e-4, p2=1e-4, p12=5e-6, overlap_min=0.5, trim_by_posterior=True, verbose=False):
    """Colocalize two datasets represented by Bayes factors"""
    if isinstance(bf1, pd.Series):
        bf1 = bf1.to_frame().T
    if isinstance(bf2, pd.Series):
        bf2 = bf2.to_frame().T

    # combinations to test
    todo_df = pd.DataFrame(itertools.product(range(len(bf1)), range(len(bf2))), columns=['i', 'j'])
    todo_df['pp4'] = 0

    isnps = bf1.columns[bf1.columns.isin(bf2.columns)]
    if len(isnps) == 0:
        return None

    pp1 = logbf_to_pp(bf1, p1, last_is_null=True)
    pp2 = logbf_to_pp(bf2, p2, last_is_null=True)
    ph0_1 = 1 - np.sum(pp1, 1)
    ph0_2 = 1 - np.sum(pp2, 1)

    prop1 = pp1[isnps].sum(1) / pp1.sum(1)
    prop2 = pp2[isnps].sum(1) / pp2.sum(1)

    if trim_by_posterior:
        # drop combinations with insufficient overlapping variants
        drop = (prop1.values[todo_df['i']] < overlap_min) | (prop2.values[todo_df['j']] < overlap_min)
        if all(drop):
            print("WARNING: snp overlap too small between datasets: too few snps with high posterior in one trait represented in other")
            return None
            # return(list(summary = cbind(data.table(nsnps = length(isnps),
            #        hit1 = colnames(pp1)[apply(pp1, 1, which.max)][todo$i],
            #        hit2 = colnames(pp2)[apply(pp2, 1, which.max)][todo$j],
            #        PP.H0.abf = pmin(ph0.1[todo$i], ph0.2[todo$j]),
            #        PP.H1.abf = NA, PP.H2.abf = NA, PP.H3.abf = NA,
            #        PP.H4.abf = NA), todo[, .(idx1 = i, idx2 = j)])))
        elif any(drop):
            todo_df = todo_df[~drop].reset_index(drop=True)  # reset index for the positional lookups below

    bf1 = bf1[isnps]
    bf2 = bf2[isnps]

    results = []
    PP = []
    for k in range(len(todo_df)):
        df = pd.DataFrame({'snp': isnps, 'bf1': bf1.values[todo_df['i'][k]].astype(np.float64),
                           'bf2': bf2.values[todo_df['j'][k]].astype(np.float64)})
        df['internal_sum_lABF'] = df['bf1'] + df['bf2']
        df['snp_pp_h4'] = np.exp(df['internal_sum_lABF'] - logsum(df['internal_sum_lABF']))
        pp_abf = combine_abf(df['bf1'], df['bf2'], p1, p2, p12, verbose=verbose)

        # guard against undefined posteriors before storing them
        if df['snp_pp_h4'].isnull().all():
            df['snp_pp_h4'] = 0
            pp_abf = pd.Series([1, 0, 0, 0, 0], index=pp_abf.index, dtype=np.float64)
        PP.append(df['snp_pp_h4'])
        hit1 = bf1.columns[np.argmax(bf1.values[todo_df['i'][k]])]
        # if (is.null(hit1)) {
        #     hit1 = "-"
        #     pp.abf[c(1, 3)] = c(0, 1)
        # }
        hit2 = bf2.columns[np.argmax(bf2.values[todo_df['j'][k]])]
        # if (is.null(hit2)) {
        #     hit2 = "-"
        #     pp.abf[c(1, 2)] = c(0, 1)
        # }
        results.append([df.shape[0], hit1, hit2] + pp_abf.tolist())
    results = pd.DataFrame(results, columns=['nsnps', 'hit1', 'hit2'] + pp_abf.index.tolist())
    results = pd.concat([results, todo_df[['i','j']].rename(columns={'i':'idx1', 'j':'idx2'})], axis=1)
    PP = pd.DataFrame(PP).T
    if len(todo_df) > 1:
        PP.columns = [f"snp_pp_h4_row{i}" for i in range(len(todo_df))]
    else:
        PP.columns = ["snp_pp_h4_abf"]

    m = results[['hit1', 'hit2']].duplicated()
    if any(m):
        results = results[~m]
        PP = PP[PP.columns[~m]]

    PP = pd.concat([pd.Series(isnps, name='snp'), PP], axis=1)
    return {'summary': results, 'results': PP, 'priors': pd.Series({'p1':p1, 'p2':p2, 'p12':p12})}


def logbf_to_pp(bf, pi, last_is_null=True):
    """
    Convert log Bayes factors to posterior probabilities

    Args:
        bf: log BFs (L x p matrix, or L x (p+1) if the last column corresponds to the null)
        pi: prior probability
        last_is_null: True if the last column of the BF matrix corresponds to the null hypothesis of no association
    """
    if isinstance(bf, pd.DataFrame):
        cols = bf.columns
        index = bf.index
        bf = bf.values.copy()
    else:
        cols = None
        bf = bf.copy()

    n = bf.shape[1]
    if last_is_null:
        n -= 1
    if np.ndim(pi) == 0:
        if pi > 1/n:
            pi = 1/n
        if last_is_null:
            pi = np.r_[np.full(n, pi), 1-n*pi]
        else:
            pi = np.full(n, pi)
    m = pi == 0
    if any(m):
        pi[m] = 1e-16
    pi /= np.sum(pi)
    if last_is_null:
        bf -= bf[:, [-1]]
    priors = np.tile(np.log(pi), [bf.shape[0], 1])

    x = bf + priors
    mmax = np.max(x, 1, keepdims=True)
    denom = mmax + np.log(np.sum(np.exp(x - mmax), 1, keepdims=True))
    pp = np.exp(bf + priors - denom)
    if cols is not None:
        pp = pd.DataFrame(pp, columns=cols, index=index)
    return pp

--------------------------------------------------------------------------------
/qtl/core.py:
--------------------------------------------------------------------------------
import subprocess
import os


def check_dependency(name):
    """Check that an external dependency is available on $PATH"""
    e = subprocess.call(f"which {name}", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    if e != 0:
        raise RuntimeError(f"External dependency '{name}' not installed")


def refresh_gcs_token():
    """Refresh the GCS OAuth token used when streaming remote files"""
    t = subprocess.check_output('gcloud auth application-default print-access-token',
                                shell=True).decode().strip()
    # os.environ (unlike os.putenv) also updates the current process environment
    os.environ['GCS_OAUTH_TOKEN'] = t
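
# Example: verify that external tools used elsewhere in this package (e.g., by
# qtl.genotype) are available before running:
#   check_dependency('tabix')
#   check_dependency('bcftools')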

--------------------------------------------------------------------------------
/qtl/genotype.py:
--------------------------------------------------------------------------------
# Author: Francois Aguet
import numpy as np
import pandas as pd
import gzip
import subprocess
import os
import tempfile

MISSING = -9  # PLINK2 convention
gt_dosage_dict = {'0/0': 0, '0/1': 1, '1/1': 2, './.': MISSING,
                  '0|0': 0, '0|1': 1, '1|0': 1, '1|1': 2, '.|.': MISSING}

class GenotypeIndexer(object):
    def __init__(self, genotype_df, variant_df, sample_ids=None):
        self.genotype_df = genotype_df
        self.index_dict = {j: i for i, j in enumerate(variant_df.index)}
        self.variant_df = variant_df.copy()
        self.variant_df['index'] = np.arange(variant_df.shape[0])
        self.chr_variant_dfs = {c: g[['pos', 'index']] for c, g in self.variant_df.groupby('chrom')}
        if sample_ids is None:
            self.sample_ids = genotype_df.columns
            self.sample_ix = np.arange(genotype_df.shape[1])
        else:
            self.sample_ids = sample_ids
            self.sample_ix = np.array([genotype_df.columns.tolist().index(i) for i in sample_ids])

    def set_sample_ids(self, sample_ids):
        self.sample_ids = sample_ids
        self.sample_ix = np.array([self.genotype_df.columns.tolist().index(i) for i in sample_ids])

    def get_indexes(self, variant_ids):
        return [self.index_dict[i] for i in variant_ids]

    def get_genotype(self, variant_id):
        return self.genotype_df.values[self.index_dict[variant_id], self.sample_ix]

    def get_genotypes(self, variant_ids):
        return self.genotype_df.values[[self.index_dict[i] for i in variant_ids]][:, self.sample_ix]

    def get_genotype_window(self, region_str):
        chrom, pos = region_str.split(':')
        start, end = pos.split('-')
        lb = np.searchsorted(self.chr_variant_dfs[chrom]['pos'].values, int(start))
        ub = np.searchsorted(self.chr_variant_dfs[chrom]['pos'].values, int(end), side='right')
        ub = np.minimum(ub, self.chr_variant_dfs[chrom].shape[0]-1)
        lb = self.chr_variant_dfs[chrom]['index'].values[lb]
        ub = self.chr_variant_dfs[chrom]['index'].values[ub]
        return self.genotype_df.iloc[lb:ub][self.sample_ids]
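
# Example (sketch): index a dosage matrix for fast lookups. genotype_df is
# variants x samples; variant_df has 'chrom' and 'pos' columns, indexed by
# variant ID, in the same order as genotype_df (IDs below are placeholders):
#   gi = GenotypeIndexer(genotype_df, variant_df)
#   g = gi.get_genotype('chr1_123456_A_G_b38')             # single variant
#   gdf = gi.get_genotype_window('chr1:1000000-2000000')   # all variants in a window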

def get_sample_ids(vcf):
    """Get sample IDs"""
    if vcf.endswith('.bcf'):
        return subprocess.check_output(f'bcftools query -l {vcf}', shell=True).decode().strip().split('\n')
    else:
        with gzip.open(vcf, 'rt') as f:
            for line in f:
                if line[:2] == '##': continue
                break
        return line.strip().split('\t')[9:]


def get_contigs(vcfpath):
    """Get list of contigs"""
    chrs = subprocess.check_output('tabix --list-chroms '+vcfpath, shell=True, executable='/bin/bash')
    return chrs.decode().strip().split()


def get_variant_ids(vcf):
    """Get list of variant IDs ('ID' field)"""
    s = subprocess.check_output(f'zcat {vcf} | grep -v "#" | cut -f3', shell=True)
    return s.decode().strip().split('\n')


def get_cis_genotypes(chrom, tss, vcf, field='GT', dosages=True, window=1000000):
    """Get genotypes in cis window (using tabix)"""
    region_str = chrom+':'+str(np.maximum(tss-window, 1))+'-'+str(tss+window)
    return get_genotypes_region(vcf, region_str, field=field, dosages=dosages)


def get_genotypes_region(vcf, region, field='GT', dosages=True):
    """Get genotypes, using region (chr:start-end) string"""
    s = subprocess.check_output(f'tabix {vcf} {region}',
                                shell=True, executable='/bin/bash')
    s = s.decode().strip()
    if len(s) == 0:
        return None
        # raise ValueError(f'No variants in region {region}')
    s = s.split('\n')
    variant_ids = [si.split('\t', 3)[-2] for si in s]
    field_ix = s[0].split('\t')[8].split(':').index(field)

    if dosages:
        if field == 'GT':
            s = [[gt_dosage_dict[i.split(':', field_ix+1)[field_ix]] for i in si.split('\t')[9:]] for si in s]
        elif field == 'DS':
            s = [[i.split(':', field_ix+1)[field_ix] for i in si.split('\t')[9:]] for si in s]
        dtype = np.float32  # covers both GT-derived dosages (incl. MISSING) and DS values
    else:
        s = [[i.split(':', field_ix+1)[field_ix] for i in si.split('\t')[9:]] for si in s]
        dtype = str

    return pd.DataFrame(data=s, index=variant_ids, columns=get_sample_ids(vcf), dtype=dtype)


def impute_mean(df, missing=lambda x: np.isnan(x), verbose=True):
    """Row-wise mean imputation (in place). Missing values: np.nan by default."""
    if isinstance(df, pd.DataFrame):
        genotypes = df.values
    else:
        genotypes = df

    n = 0
    for g in genotypes:
        ix = missing(g)
        if np.any(ix):
            g[ix] = np.mean(g[~ix])
            n += 1

    if verbose and n > 0:
        print(f'  * imputed at least 1 sample in {n} sites')


def get_genotype(variant_id, vcf, field='GT', convert_gt=True, sample_ids=None):
    """
    Parse genotypes for given variant from VCF. Requires tabix.

    variant_id: {chr}_{pos}_{ref}_{alt}_{build}
    vcf: VCF path
    field: GT or DS
    convert_gt: convert GT to dosages
    sample_ids: VCF sample IDs
    """
    chrom, pos = variant_id.split('_')[:2]
    s = subprocess.check_output(f"tabix {vcf} {chrom}:{pos}-{pos}", shell=True)
    if len(s) == 0:
        raise ValueError(f"Variant '{variant_id}' not found in VCF.")

    s = s.decode().strip()
    if '\n' in s:  # multiple variants at this position; match on ID
        s = s.split('\n')
        try:
            s = s[np.nonzero(np.array([i.split('\t', 3)[-2] for i in s]) == variant_id)[0][0]]
        except IndexError:
            raise ValueError("Variant ID not found in VCF.")
    s = s.split('\t')
    fmt = s[8].split(':')

    if field == 'DS':
        if 'DS' in fmt:
            ds_ix = fmt.index('DS')
            s = np.array([np.float32(i.split(':')[ds_ix]) for i in s[9:]])  # dosages
        else:
            raise ValueError('No dosage (DS) values found in VCF.')
    else:  # use GT if DS not requested/present
        assert fmt[0] == 'GT'
        s = [i.split(':', 1)[0] for i in s[9:]]
        if convert_gt:
            s = np.float32([gt_dosage_dict[i] for i in s])

    if sample_ids is None:
        sample_ids = get_sample_ids(vcf)
    s = pd.Series(s, index=sample_ids, name=variant_id)

    return s


def get_genotypes(variant_ids, vcf, field='GT', drop_duplicates=True):
    """Parse genotypes for multiple variants from VCF. Requires tabix."""
    variant_id_set = set(variant_ids)

    with tempfile.NamedTemporaryFile() as regions_file:
        df = pd.DataFrame([i.split('_')[:2] for i in variant_id_set], columns=['chr', 'pos'])
        df['pos'] = df['pos'].astype(int)
        df = df.sort_values(['chr', 'pos'])
        df.to_csv(regions_file.name, sep='\t', index=False, header=False)
        s = subprocess.check_output(f'tabix {vcf} --regions {regions_file.name}', shell=True)

    s = s.decode().strip().split('\n')
    s = [i.split('\t') for i in s]
    variant_ids2 = [i[2] for i in s]
    if field == 'GT':
        gt_ix = s[0][8].split(':').index('GT')
        dosages = [[gt_dosage_dict[j.split(':')[gt_ix]] for j in i[9:]] for i in s]
    elif field == 'DS':
        ds_ix = s[0][8].split(':').index('DS')
        dosages = np.float32([[j.split(':')[ds_ix] for j in i[9:]] for i in s])
    df = pd.DataFrame(dosages, index=variant_ids2, columns=get_sample_ids(vcf))
    df = df[df.index.isin(variant_id_set)]
    if drop_duplicates:
        df = df[~df.index.duplicated()]
    return df
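
# Example (sketch): load dosages for a window from a tabix-indexed VCF (path is
# a placeholder) and mean-impute missing genotypes (encoded as MISSING for GT):
#   df = get_genotypes_region('genotypes.vcf.gz', 'chr1:1000000-2000000')
#   impute_mean(df, missing=lambda x: x == MISSING)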

def get_allele_stats(genotype_df):
    """Returns allele frequency, minor allele samples, and minor allele counts (row-wise).

    Assumes missing genotypes have been imputed (see impute_mean).
    """
    # allele frequency
    n2 = 2 * genotype_df.shape[1]
    af = genotype_df.sum(1) / n2
    # minor allele samples and counts
    ix = af <= 0.5
    m = genotype_df > 0.5
    a = m.sum(1)  # samples carrying the ALT allele
    b = (genotype_df < 1.5).sum(1)  # samples carrying the REF allele
    ma_samples = np.where(ix, a, b)
    a = (genotype_df * m).sum(1).astype(int)  # ALT allele count
    ma_count = np.where(ix, a, n2-a)
    return af, ma_samples, ma_count
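
# Worked example: for one variant with dosages [0, 1, 2, 1] across 4 samples
# (2N = 8 alleles): af = 4/8 = 0.5, ma_samples = 3 (minor allele carriers),
# ma_count = 4.
#   af, ma_samples, ma_count = get_allele_stats(genotype_df)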

def load_vcf(vcf, field='GT', dtype=None, verbose=False):
    """Load VCF as DataFrame"""
    sample_ids = subprocess.check_output(f'bcftools query -l {vcf}', shell=True).decode().strip().split()
    n_samples = len(sample_ids)
    n_variants = int(subprocess.check_output(f'bcftools index -n {vcf}', shell=True).decode())

    if dtype is None:
        if field == 'GT':
            dtype = np.int8
        elif field == 'DS':
            dtype = np.float32
    dosages = np.zeros([n_variants, n_samples], dtype=dtype)

    variant_ids = []
    with gzip.open(vcf, 'rt') as f:
        for line in f:
            if line.startswith('#'): continue  # skip header lines
            break

        # parse format from first line
        line = line.strip().split('\t')
        if field not in line[8]:
            raise ValueError(f"FORMAT does not include {field}. Available fields: {', '.join(line[8].split(':'))}")
        format_ix = line[8].split(':').index(field)
        variant_ids.append(line[2])
        if field == 'GT':
            dosages[0,:] = [gt_dosage_dict.get(i.split(':')[format_ix], MISSING) for i in line[9:]]
        elif field == 'DS':
            d = [i.split(':')[format_ix] for i in line[9:]]
            d = [dtype(i) if i != '.' else dtype(MISSING) for i in d]
            dosages[0,:] = d

        for k,line in enumerate(f, 1):
            line = line.strip().split('\t')
            variant_ids.append(line[2])
            if field == 'GT':
                dosages[k,:] = [gt_dosage_dict.get(i.split(':')[format_ix], MISSING) for i in line[9:]]
            elif field == 'DS':
                d = [i.split(':')[format_ix] for i in line[9:]]
                d = [dtype(i) if i != '.' else dtype(MISSING) for i in d]
                dosages[k,:] = d
            if verbose and ((k+1) % 1000 == 0 or k+1 == n_variants):
                print(f'\rVariants parsed: {k+1:,}', end='' if k+1 < n_variants else None)

    return pd.DataFrame(dosages, index=variant_ids, columns=sample_ids)

--------------------------------------------------------------------------------
/qtl/gtex.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import json
import urllib.request
import ssl
from collections.abc import Iterable
from matplotlib.colors import hsv_to_rgb, to_hex


def s2d(x):
    """Parse donor ID from sample ID"""
    if isinstance(x, str):
        return '-'.join(x.split('-')[:2])
    elif isinstance(x, Iterable):
        return ['-'.join(i.split('-')[:2]) for i in x]


def get_tissue_id(t):
    """Convert tissue name to tissue ID"""
    if isinstance(t, str):
        return t.replace('(','').replace(')','').replace(' - ', ' ').replace(' ', '_')
    elif isinstance(t, Iterable):
        return [i.replace('(','').replace(')','').replace(' - ', ' ').replace(' ', '_') for i in t]


def _get_api_data():
    # note: certificate verification is disabled for the portal API request
    context = ssl.create_default_context()
    context.check_hostname = False
    context.verify_mode = ssl.CERT_NONE
    tissues_json = json.loads(urllib.request.urlopen('https://gtexportal.org/api/v2/dataset/tissueSiteDetail',
                                                     context=context).read().decode())['data']
    return tissues_json


def get_colors_df(diff_brain=False):
    """Return pd.DataFrame mapping tissue IDs to colors

    diff_brain: if True, assign distinct hues to the brain subregions (which otherwise share one color)
    """
    tissues_json = _get_api_data()
    colors_df = pd.DataFrame(tissues_json).rename(columns={
        'tissueSiteDetailId':'tissue_id',
        'colorHex':'color_hex',
        'colorRgb':'color_rgb',
        'tissueSiteDetail':'tissue_site_detail',
        'tissueSiteDetailAbbr':'tissue_abbrv',
        'tissueSite':'tissue_site',
        'ontologyId':'ontology_id',
    }).set_index('tissue_id')
    colors_df = colors_df[['tissue_site', 'tissue_site_detail', 'tissue_abbrv', 'ontology_id', 'color_rgb', 'color_hex']]
    colors_df['color_hex'] = '#' + colors_df['color_hex']
    if diff_brain:
        rgb_s = pd.Series({
            'Brain_Amygdala':                        hsv_to_rgb([0.1,  1., 0.933]),
            'Brain_Anterior_cingulate_cortex_BA24':  hsv_to_rgb([0.11, 1., 0.933]),
            'Brain_Caudate_basal_ganglia':           hsv_to_rgb([0.12, 1., 0.933]),
            'Brain_Cerebellar_Hemisphere':           hsv_to_rgb([0.13, 1., 0.933]),
            'Brain_Cerebellum':                      hsv_to_rgb([0.13, 1., 0.933]),
            'Brain_Cortex':                          hsv_to_rgb([0.14, 1., 0.933]),
            'Brain_Frontal_Cortex_BA9':              hsv_to_rgb([0.14, 1., 0.933]),
            'Brain_Hippocampus':                     hsv_to_rgb([0.15, 1., 0.933]),
            'Brain_Hypothalamus':                    hsv_to_rgb([0.16, 1., 0.933]),
            'Brain_Nucleus_accumbens_basal_ganglia': hsv_to_rgb([0.17, 1., 0.933]),
            'Brain_Putamen_basal_ganglia':           hsv_to_rgb([0.18, 1., 0.933]),
            'Brain_Spinal_cord_cervical_c-1':        hsv_to_rgb([0.19, 1., 0.933]),
            'Brain_Substantia_nigra':                hsv_to_rgb([0.2,  1., 0.933]),
        })
        brain_tissues = [i for i in sorted(colors_df.index) if i.startswith('Brain')]
        colors_df.loc[brain_tissues, 'color_hex'] = rgb_s[brain_tissues].apply(lambda x: to_hex(x).upper())
        colors_df.loc[brain_tissues, 'color_rgb'] = rgb_s[brain_tissues].apply(
            lambda x: ','.join(np.round(x*255).astype(int).astype(str)))

    colors_df.index.name = 'tissue_id'
    colors_df.insert(3, 'tissue_title', colors_df['tissue_site_detail'].map(tissue_title_map))
    return colors_df
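
# Example (sketch): fetch the GTEx tissue color palette and look up one tissue:
#   colors_df = get_colors_df()
#   colors_df.loc['Whole_Blood', 'color_hex']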

# Simplified tissue names for figures
tissue_title_map = {
    'Adipose - Subcutaneous': 'Subcutaneous adipose',
    'Adipose - Visceral (Omentum)': 'Visceral omentum',
    'Adrenal Gland': 'Adrenal gland',
    'Artery - Aorta': 'Aorta',
    'Artery - Coronary': 'Coronary artery',
    'Artery - Tibial': 'Tibial artery',
    'Bladder': 'Bladder',
    'Brain - Amygdala': 'Amygdala',
    'Brain - Anterior cingulate cortex (BA24)': 'Anterior cingulate cortex',
    'Brain - Caudate (basal ganglia)': 'Caudate (basal ganglia)',
    'Brain - Cerebellar Hemisphere': 'Cerebellar hemisphere',
    'Brain - Cerebellum': 'Cerebellum',
    'Brain - Cortex': 'Cortex',
    'Brain - Frontal Cortex (BA9)': 'Frontal cortex (BA9)',
    'Brain - Hippocampus': 'Hippocampus',
    'Brain - Hypothalamus': 'Hypothalamus',
    'Brain - Nucleus accumbens (basal ganglia)': 'Nucleus accumbens (basal ganglia)',
    'Brain - Putamen (basal ganglia)': 'Putamen (basal ganglia)',
    'Brain - Spinal cord (cervical c-1)': 'Spinal cord (cervical c-1)',
    'Brain - Substantia nigra': 'Substantia nigra',
    'Breast - Mammary Tissue': 'Breast mammary tissue',
    'Cells - EBV-transformed lymphocytes': 'EBV-transformed lymphocytes',
    'Cells - Cultured fibroblasts': 'Cultured fibroblasts',
    'Cervix - Ectocervix': 'Ectocervix',
    'Cervix - Endocervix': 'Endocervix',
    'Colon - Sigmoid': 'Sigmoid colon',
    'Colon - Transverse': 'Transverse colon',
    'Esophagus - Gastroesophageal Junction': 'Gastroesophageal junction',
    'Esophagus - Mucosa': 'Esophagus mucosa',
    'Esophagus - Muscularis': 'Esophagus muscularis',
    'Fallopian Tube': 'Fallopian tube',
    'Heart - Atrial Appendage': 'Atrial appendage',
    'Heart - Left Ventricle': 'Left ventricle',
    'Kidney - Cortex': 'Kidney cortex',
    'Kidney - Medulla': 'Kidney medulla',
    'Liver': 'Liver',
    'Lung': 'Lung',
    'Minor Salivary Gland': 'Minor salivary gland',
    'Muscle - Skeletal': 'Skeletal muscle',
    'Nerve - Tibial': 'Tibial nerve',
    'Ovary': 'Ovary',
    'Pancreas': 'Pancreas',
    'Pituitary': 'Pituitary',
    'Prostate': 'Prostate',
    'Skin - Not Sun Exposed (Suprapubic)': 'Not sun-exposed skin (suprapubic)',
    'Skin - Sun Exposed (Lower leg)': 'Sun-exposed skin (lower leg)',
    'Small Intestine - Terminal Ileum': 'Small intestine terminal ileum',
    'Spleen': 'Spleen',
    'Stomach': 'Stomach',
    'Testis': 'Testis',
    'Thyroid': 'Thyroid',
    'Uterus': 'Uterus',
    'Vagina': 'Vagina',
    'Whole Blood': 'Whole blood',
}

entex_tissue_map = {
    "Peyer's patch": 'Small Intestine - Terminal Ileum',
    'adrenal gland': 'Adrenal Gland',
    'ascending aorta': 'Artery - Aorta',  # correct mapping?
    'body of pancreas': 'Pancreas',
    'breast epithelium': 'Breast - Mammary Tissue',
    'coronary artery': 'Artery - Coronary',
    'esophagus muscularis mucosa': 'Esophagus - Muscularis',
    'esophagus squamous epithelium': 'Esophagus - Mucosa',
    'gastrocnemius medialis': 'Muscle - Skeletal',
    'gastroesophageal sphincter': 'Esophagus - Gastroesophageal Junction',
    'heart left ventricle': 'Heart - Left Ventricle',
    'lower leg skin': 'Skin - Sun Exposed (Lower leg)',
    'omental fat pad': 'Adipose - Visceral (Omentum)',
    'ovary': 'Ovary',
    'prostate gland': 'Prostate',
    'right atrium auricular region': 'Heart - Atrial Appendage',
    'right lobe of liver': 'Liver',
    'sigmoid colon': 'Colon - Sigmoid',
    'spleen': 'Spleen',
    'stomach': 'Stomach',
    'subcutaneous adipose tissue': 'Adipose - Subcutaneous',
    'suprapubic skin': 'Skin - Not Sun Exposed (Suprapubic)',
    'testis': 'Testis',
    'thoracic aorta': 'Artery - Aorta',  # correct mapping?
    'thyroid gland': 'Thyroid',
    'tibial artery': 'Artery - Tibial',
    'tibial nerve': 'Nerve - Tibial',
    'transverse colon': 'Colon - Transverse',
    'upper lobe of left lung': 'Lung',
    'uterus': 'Uterus',
    'vagina': 'Vagina'
}

--------------------------------------------------------------------------------
/qtl/io.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from collections import defaultdict
import subprocess
import gzip


def to_bgzip(df, path, header=True, float_format=None):
    """Write DataFrame to bgzip"""
    assert path.endswith('.gz')
    bgzip = subprocess.Popen(f"bgzip -c > {path}", stdin=subprocess.PIPE, shell=True, encoding='utf8')
    df.to_csv(bgzip.stdin, sep='\t', index=False, header=header, float_format=float_format)
    stdout, stderr = bgzip.communicate()
    subprocess.check_call(f"tabix -f {path}", shell=True)


def sort_bed(bed_df, inplace=True):
    """Sort BED DataFrame (numerically by chromosome; 'X' is sorted as 23)"""
    sorted_df = bed_df.sort_values(['chr', 'start', 'end'], key=lambda x:
        x.str.replace('chr','').str.replace('X','23').astype(int) if x.dtype == object else x,
        inplace=inplace)
    if inplace:
        bed_df.reset_index(drop=True, inplace=True)
    else:
        sorted_df.reset_index(drop=True, inplace=True)
    return sorted_df


def write_bed(bed_df, output_name, header=True, float_format=None):
    """Write DataFrame to BED format"""
    if header:
        assert (bed_df.columns[0] == 'chr' or bed_df.columns[0] == '#chr') and bed_df.columns[1] == 'start' and bed_df.columns[2] == 'end'
        # header must be commented in BED format
        header = bed_df.columns.values.copy()
        header[0] = '#chr'
    to_bgzip(bed_df, output_name, header=header, float_format=float_format)
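
# Example (sketch): write a phenotype BED file (bgzip/tabix must be on $PATH;
# the first three columns must be 'chr', 'start', 'end'; path is a placeholder):
#   write_bed(bed_df, 'phenotypes.bed.gz')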

def read_gct(gct_file, sample_ids=None, dtype=None, load_description=True, skiprows=2):
    """Load GCT as DataFrame"""
    if sample_ids is not None:
        sample_ids = ['Name'] + list(sample_ids)

    if gct_file.endswith('.gct.gz') or gct_file.endswith('.gct'):
        if dtype is not None:
            # parse the header line to assign a dtype to each sample column
            opener = gzip.open(gct_file, 'rt') if gct_file.endswith('.gz') else open(gct_file)
            with opener as gct:
                for _ in range(skiprows):
                    gct.readline()
                header_ids = gct.readline().strip().split()
            dtypes = {i:dtype for i in header_ids[2:]}
            dtypes['Name'] = str
            dtypes['Description'] = str
            df = pd.read_csv(gct_file, sep='\t', skiprows=skiprows, usecols=sample_ids, index_col=0, dtype=dtypes)
        else:
            df = pd.read_csv(gct_file, sep='\t', skiprows=skiprows, usecols=sample_ids, index_col=0)
    elif gct_file.endswith('.parquet'):
        df = pd.read_parquet(gct_file, columns=sample_ids)
    else:
        raise ValueError('Unsupported input format.')
    if not load_description and 'Description' in df.columns:
        df.drop('Description', axis=1, inplace=True)
    return df


def write_gct(df, gct_file, float_format='%.6g', compresslevel=6):
    """Write DataFrame to GCT format"""
    assert df.index.name == 'Name' and df.columns[0] == 'Description'
    if gct_file.endswith('.gct.gz'):
        opener = gzip.open(gct_file, 'wt', compresslevel=compresslevel)
    else:
        opener = open(gct_file, 'w')

    with opener as gct:
        gct.write(f'#1.2\n{df.shape[0]:d}\t{df.shape[1]-1:d}\n')
        df.to_csv(gct, sep='\t', float_format=float_format)


def gtf_to_tss_bed(annotation_gtf, feature='gene', exclude_chrs=[], phenotype_id='gene_id'):
    """Parse genes and TSSs from GTF and return DataFrame for BED output"""
    chrom = []
    start = []
    end = []
    gene_id = []
    gene_name = []

    if annotation_gtf.endswith('.gz'):
        opener = gzip.open(annotation_gtf, 'rt')
    else:
        opener = open(annotation_gtf, 'r')

    with opener as gtf:
        for row in gtf:
            row = row.strip().split('\t')
            if row[0][0] == '#' or row[2] != feature: continue  # skip header lines and non-matching features
            chrom.append(row[0])

            # TSS: gene start (0-based coordinates for BED)
            if row[6] == '+':
                start.append(np.int64(row[3])-1)
                end.append(np.int64(row[3]))
            elif row[6] == '-':
                start.append(np.int64(row[4])-1)  # last base of gene
                end.append(np.int64(row[4]))
            else:
                raise ValueError('Strand not specified.')

            attributes = defaultdict()
            for a in row[8].replace('"', '').split(';')[:-1]:
                kv = a.strip().split(' ')
                if kv[0] != 'tag':
                    attributes[kv[0]] = kv[1]
                else:
                    attributes.setdefault('tags', []).append(kv[1])

            gene_id.append(attributes['gene_id'])
            gene_name.append(attributes['gene_name'])

    if phenotype_id == 'gene_id':
        bed_df = pd.DataFrame(data={'chr':chrom, 'start':start, 'end':end, 'gene_id':gene_id}, columns=['chr', 'start', 'end', 'gene_id'], index=gene_id)
    elif phenotype_id == 'gene_name':
        bed_df = pd.DataFrame(data={'chr':chrom, 'start':start, 'end':end, 'gene_id':gene_name}, columns=['chr', 'start', 'end', 'gene_id'], index=gene_name)
    # drop rows corresponding to excluded chromosomes
    mask = np.ones(len(chrom), dtype=bool)
    for k in exclude_chrs:
        mask = mask & (bed_df['chr'] != k)
    bed_df = bed_df[mask]

    # sort by start position
    bed_df = bed_df.groupby('chr', sort=False, group_keys=False).apply(lambda x: x.sort_values('start'))

    return bed_df

--------------------------------------------------------------------------------
/qtl/locusplot.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""locusplot.py: LocusZoom-style visualization of the p-value landscape for multiple QTLs or GWAS"""

__author__ = "Francois Aguet"
__copyright__ = "Copyright 2019, The Broad Institute"
__license__ = "BSD3"

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.patches as patches
from cycler import cycler
import seaborn as sns
import argparse
import subprocess
import os
import io
import gzip
import re
from collections.abc import Iterable

from . import annotation
from . import genotype as gt
from . import plot


def get_sample_ids(vcf):
    """Get sample IDs from VCF"""
    if vcf.endswith('.bcf'):
        return subprocess.check_output(f'bcftools query -l {vcf}', shell=True).decode().strip().split('\n')
    else:
        with gzip.open(vcf, 'rt') as f:
            for line in f:
                if line[:2] == '##': continue
                break
        return line.strip().split('\t')[9:]


def get_cis_genotypes(chrom, tss, vcf, field='GT', window=1000000):
    """Get dosages from VCF (using tabix)"""
    region_str = chrom+':'+str(np.maximum(tss-window, 1))+'-'+str(tss+window)
    return get_genotypes_region(vcf, region_str, field=field)


def get_genotypes_region(vcf, region, field='GT'):
    """Get dosages from VCF (using tabix)"""
    print(f'Getting {field} for region {region}')
    cmd = 'tabix '+vcf+' '+region
    s = subprocess.check_output(cmd, shell=True, executable='/bin/bash')
    s = s.decode().strip()
    if len(s) == 0:
        raise ValueError(f'No variants in region {region}')
    s = s.split('\n')
    variant_ids = [si.split('\t', 3)[-2] for si in s]
    field_ix = s[0].split('\t')[8].split(':').index(field)

    if field == 'GT':
        gt_map = {'0/0':0, '0/1':1, '1/1':2, './.':np.nan,
                  '0|0':0, '0|1':1, '1|0':1, '1|1':2, '.|.':np.nan}
        s = [[gt_map[i.split(':', field_ix+1)[field_ix]] for i in si.split('\t')[9:]] for si in s]
    else:
        s = [[i.split(':', field_ix+1)[field_ix] for i in si.split('\t')[9:]] for si in s]

    return pd.DataFrame(data=s, index=variant_ids, columns=get_sample_ids(vcf), dtype=np.float32)


def load_eqtl(eqtl_file, gene_id, chrom=None):
    """Load full eQTL or ieQTL summary statistics for the specified gene"""
    if eqtl_file.endswith('parquet'):
        p = eqtl_file
        if chrom is not None:
            p = eqtl_file.replace(re.findall(r'chr\d+', eqtl_file)[0], chrom)
        cols = ['phenotype_id', 'variant_id', 'pval_gi', 'pval_nominal']
        eqtl_df = pd.read_parquet(p, columns=cols)
        eqtl_df = eqtl_df[eqtl_df['phenotype_id'] == gene_id].set_index('variant_id').rename(columns={'pval_gi':'pval_nominal'})
    else:
        s = subprocess.check_output(f'zcat {eqtl_file} | grep {gene_id}', shell=True).decode()
        eqtl_cols = ['gene_id', 'variant_id', 'tss_distance', 'ma_samples', 'ma_count', 'maf', 'pval_nominal', 'slope', 'slope_se']
        eqtl_df = pd.read_csv(io.StringIO(s), sep='\t', header=None, names=eqtl_cols, index_col=1)
    eqtl_df['position'] = eqtl_df.index.map(lambda x: int(x.split('_')[1]))
    return eqtl_df


def load_gwas(gwas_file, variant_ids):
    """Load GWAS summary statistics"""
    gwas_df = pd.read_csv(gwas_file, sep='\t', usecols=['panel_variant_id', 'position', 'pvalue', 'frequency', 'sample_size'], index_col=0)
    gwas_df = gwas_df.loc[gwas_df.index.isin(variant_ids)].rename(columns={'pvalue':'pval_nominal', 'frequency':'maf'})
    gwas_df['maf'] = np.where(gwas_df['maf']<=0.5, gwas_df['maf'], 1-gwas_df['maf'])
    return gwas_df


def compute_ld(genotype_df, variant_id):
    """Compute LD (r2) between variant_id and all rows of genotype_df"""
    # equivalent to genotype_df.corrwith(genotype_df.loc[variant_id], axis=1, method='pearson')**2
    g0 = genotype_df - genotype_df.values.mean(1, keepdims=True)
    d = (g0**2).sum(1) * (g0.loc[variant_id]**2).sum()
    return (g0 * g0.loc[variant_id]).sum(1)**2 / d
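
# Example (sketch): r² of all variants in a window relative to a lead variant,
# given an imputed dosage matrix (variants x samples; the ID is a placeholder):
#   r2_s = compute_ld(genotype_df, 'chr1_123456_A_G_b38')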

def get_ld(vcf, variant_id, phenotype_bed, window=200000):
    """Load genotypes and compute LD (r2)"""
    phenotype_df = pd.read_csv(phenotype_bed, sep='\t', index_col=3, nrows=0).drop(['#chr', 'start', 'end'], axis=1)
    chrom, pos, _, _, _ = variant_id.split('_')
    pos = int(pos)
    genotype_df = get_cis_genotypes(chrom, pos, vcf, window=window)[phenotype_df.columns]
    gt.impute_mean(genotype_df, verbose=False)
    r2_s = compute_ld(genotype_df, variant_id)
    return r2_s


def get_rsid(id_lookup_table, variant_id):
    """Look up the rsID for a variant in a (gzipped) lookup table"""
    s = subprocess.check_output(f'zcat {id_lookup_table} | grep {variant_id}', shell=True).decode()
    rs_id = [i for i in s.strip().split('\t') if i.startswith('rs')]
    assert len(rs_id) == 1
    return rs_id[0]
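
# Example (sketch): LD relative to a lead variant, restricted to the samples in
# a phenotype BED file (paths/IDs are placeholders):
#   r2_s = get_ld('genotypes.vcf.gz', 'chr1_123456_A_G_b38', 'phenotypes.bed.gz')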

def compare_loci(pval_df1, pval_df2, r2_s, variant_id=None, rs_id=None,
                 highlight_ids=None, colorbar=True, ah=2, aw=2):
    """Plot similar to LocusCompare (Liu et al., Nat Genet, 2019)"""
    assert pval_df1.index.equals(pval_df2.index)

    dl = 0.75
    dr = 0.75
    db = 0.75
    dt = 0.25
    fw = dl + aw + dr
    fh = db + ah + dt

    fig = plt.figure(facecolor=(1,1,1), figsize=(fw,fh))
    ax = fig.add_axes([dl/fw, db/fh, aw/fw, ah/fh])

    # LocusZoom colors
    lz_colors = ["#7F7F7F", "#282973", "#8CCCF0", "#69BD45", "#F9A41A", "#ED1F24"]
    select_args = {'s':24, 'marker':'D', 'c':"#714A9D", 'edgecolor':'k', 'lw':0.25}
    highlight_args = {'s':24, 'marker':'D', 'edgecolor':'k', 'lw':0.25}
    cmap = mpl.colors.ListedColormap(lz_colors)
    bounds = np.append(-1, np.arange(0,1.2,0.2))
    norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

    if colorbar:
        s = 0.66
        cax = fig.add_axes([(dl+aw+0.2)/fw, (db+ah-1.25*s)/fh, s*0.25/fw, s*1.25/fh])
        cb = mpl.colorbar.ColorbarBase(cax, cmap=cmap,
                                       norm=norm,
                                       boundaries=bounds[1:],  # start at 0
                                       ticks=bounds[1:],
                                       spacing='proportional',
                                       orientation='vertical')
        cax.set_title(r'r$\mathregular{^2}$', fontsize=12)
        cax.set_ylim([0,1])

    if rs_id is not None:
        t = rs_id
    elif variant_id is not None:  # reformat variant ID
        t = variant_id.split('_b')[0].replace('_',':',1).replace('_','-')

    x = -np.log10(pval_df1['pval_nominal'])
    y = -np.log10(pval_df2['pval_nominal'])

    # sort variants by LD; plot high LD in front
    s = r2_s[x.index].sort_values().index
    ax.scatter(x[s], y[s], c=r2_s[s].replace(np.nan, -1), s=20, cmap=cmap, norm=norm, edgecolor='k', lw=0.25, clip_on=False)

    if highlight_ids is not None:
        ax.scatter(x[highlight_ids], y[highlight_ids], **highlight_args, clip_on=False)

    if variant_id is not None:
        x = -np.log10(pval_df1.loc[variant_id, 'pval_nominal'])
        y = -np.log10(pval_df2.loc[variant_id, 'pval_nominal'])
        ax.scatter(x, y, **select_args)
        txt = ax.annotate(t, (x, y), xytext=(-5,5), textcoords='offset points', ha='right')
    else:  # annotate lead variants
        v = pval_df1['pval_nominal'].idxmin()
        x = -np.log10(pval_df1.loc[v, 'pval_nominal'])
        y = -np.log10(pval_df2.loc[v, 'pval_nominal'])
        t = v.split('_b')[0].replace('_',':',1).replace('_','-')
        # ax.scatter(x, y, **select_args)
        txt = ax.annotate(t, (x, y), xytext=(5,5), textcoords='offset points', ha='left')
        v = pval_df2['pval_nominal'].idxmin()
        x = -np.log10(pval_df1.loc[v, 'pval_nominal'])
        y = -np.log10(pval_df2.loc[v, 'pval_nominal'])
        t = v.split('_b')[0].replace('_',':',1).replace('_','-')
        # ax.scatter(x, y, **select_args)
        txt = ax.annotate(t, (x, y), xytext=(-5,5), textcoords='offset points', ha='right')

    ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True, min_n_ticks=4, nbins=5))
    ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True, min_n_ticks=4, nbins=5))

    ax.set_xlabel(r'-log$\mathregular{_{10}}$(p-value)', fontsize=12)
    ax.set_ylabel(r'-log$\mathregular{_{10}}$(p-value)', fontsize=12)

    ax.set_xlim([0, ax.get_xlim()[1]])
    ax.set_ylim([0, ax.get_ylim()[1]])
    ax.spines['left'].set_position(('outward', 6))
    ax.spines['bottom'].set_position(('outward', 6))
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    return ax
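
# Example (sketch): LocusCompare-style plot of eQTL vs. GWAS -log10(p), given
# two tables indexed by variant ID with a 'pval_nominal' column (ID is a placeholder):
#   ax = compare_loci(eqtl_df, gwas_df, r2_s, variant_id='chr1_123456_A_G_b38')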

def plot_locus(pvals, variant_ids=None, gene=None, r2_s=None, rs_id=None,
               highlight_ids=None, credible_sets=None, show_lead=True, show_rsid=True, label_first_only=False,
               tracks=None, track_colors=None, shared_only=True, show_effect=False,
               xlim=None, ymax=None, miny=5, sharey=None, labels=None, label_fontsize=12, title=None, shade_range=None, shade_color='#cecece',
               label_pos='left', gene_label_pos=None, chr_label_pos='bottom', window=200000, colorbar=True, gene_scale=0.33,
               dl=0.75, aw=4, dr=0.75, db=0.5, ah=1.25, dt=0.25, ds=0.1, gh=0.2, th=1.5, ytext=5,
               single_ylabel=False, ylabel=r'-log$\mathregular{_{10}}$(p-value)', rasterized=False):
    """
    pvals: pd.DataFrame, or list of pd.DataFrame. Must contain 'chr', 'position', and 'pval_nominal' (or 'pip') columns.
    variant_ids: lead variant ID(s) to label; if None, the most significant variant of each input is used
    gene: qtl.annotation.Gene, or list thereof
    tracks: pd.DataFrame of signal tracks (e.g., ATAC-seq coverage), indexed by position, one column per track
    track_colors: dict mapping track name to color
    shared_only: only plot variants that are present in all inputs
    sharey: list of dataset indexes with shared ylim
    show_effect: indicate effect direction of lead variant with up/down arrow
    """

    if isinstance(pvals, pd.DataFrame):
        pvals = [pvals]
    n = len(pvals)
    if not isinstance(gene, Iterable):
        gene = [gene]

    if variant_ids is None:
        variant_ids = []
        for p in pvals:
            if 'pval_nominal' in p:
                if p['pval_nominal'].max() > 1:  # assume -log10(P)
                    variant_ids.append(p['pval_nominal'].idxmax())
                else:
                    variant_ids.append(p['pval_nominal'].idxmin())
            elif 'pip' in p:
                variant_ids.append(p['pip'].idxmax())
            else:
                variant_ids.append(None)
    elif isinstance(variant_ids, str):
        variant_ids = [variant_ids]*n

    i = [i for i,p in enumerate(pvals) if variant_ids[0] in p.index]
    if i:
        chrom, pos = pvals[i[0]].loc[variant_ids[0], ['chr', 'position']]
        pos = int(pos)
    else:
        raise ValueError(f"{variant_ids[0]} not found in any of the inputs.")

    # set up figure
    if chr_label_pos != 'bottom':
        db = 0.25
        dt = 0.5
    fw = dl + aw + dr
    fh = db + n*ah + (n-1)*ds + dt
    if gene[0] is not None:
        fh += gh
    else:
        gh = 0
    if tracks is not None:
        fh += th + ds
    fig = plt.figure(figsize=(fw,fh), facecolor='none')
    axes = [fig.add_axes([dl/fw, (fh-dt-ah)/fh, aw/fw, ah/fh])]
    plot.format_plot(axes[-1], y_offset=6)
    for i in range(1,n):
        axes.append(fig.add_axes([dl/fw, (fh-dt-ah-i*(ah+ds))/fh, aw/fw, ah/fh], sharex=axes[0], facecolor='none'))
        plot.format_plot(axes[-1], y_offset=6)
    if tracks is not None:
        tax = fig.add_axes([dl/fw, (fh-dt-n*(ah+ds)-th)/fh, aw/fw, th/fh], sharex=axes[0], facecolor='none')
    if gene[0] is not None:
        gax = fig.add_axes([dl/fw, (db)/fh, aw/fw, gh/fh], sharex=axes[0], facecolor='none', label='Gene')

    if xlim is None:
        xlim = np.array([pos-window, pos+window])
    axes[0].set_xlim(xlim)
    axes[0].xaxis.set_major_locator(ticker.MaxNLocator(min_n_ticks=3, nbins=4))

    # LocusZoom colors
    lz_colors = ["#7F7F7F", "#282973", "#8CCCF0", "#69BD45", "#F9A41A", "#ED1F24"]
    select_args = {'s':24, 'marker':'D', 'c':"#714A9D", 'edgecolor':'k', 'lw':0.25}
    highlight_args = {'s':24, 'marker':'D', 'edgecolor':'k', 'lw':0.25}
    cmap = mpl.colors.ListedColormap(lz_colors)
    bounds = np.append(-1, np.arange(0,1.2,0.2))
    norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

    if colorbar:
        s = 0.66
        cax = fig.add_axes([(dl+aw+0.1)/fw, (fh-dt-ah+(1-s)/2*ah)/fh, s*ah/5/fw, s*ah/fh])
        cb = mpl.colorbar.ColorbarBase(cax, cmap=cmap,
                                       norm=norm,
                                       boundaries=bounds[1:],  # start at 0
                                       ticks=bounds[1:],
                                       spacing='proportional',
                                       orientation='vertical')
        cax.set_ylim([0,1])
        cax.set_title(r'r$\mathregular{^2}$', fontsize=12)

    # common set of variants
    common_ix = pvals[0].index
    for pval_df in pvals[1:]:
        common_ix = common_ix[common_ix.isin(pval_df.index)]

    # plot p-values
    ylabels = []
    for k,(ax,variant_id,pval_df) in enumerate(zip(axes, variant_ids, pvals)):
        # select variants in window
        m = (pval_df['position'] >= xlim[0]) & (pval_df['position'] <= xlim[1])
        if shared_only:
            m &= pval_df.index.isin(common_ix)
        window_df = pval_df.loc[m]
        x = window_df['position']
        if 'pval_nominal' in pval_df:
            if pval_df['pval_nominal'].max() > 1:  # assume values are already -log10(P)
                p = window_df['pval_nominal']
                minp = pval_df.loc[variant_id, 'pval_nominal']
            else:
                p = -np.log10(window_df['pval_nominal'])
                minp = -np.log10(pval_df.loc[variant_id, 'pval_nominal'])
            ylabels.append(ylabel)

            # sort variants by LD; plot high LD in front
            if r2_s is not None:
                s = r2_s[window_df.index].sort_values().index
                r2 = r2_s[s].replace(np.nan, -1)
            elif 'r2' in pval_df:
                s = pval_df.loc[window_df.index, 'r2'].sort_values(na_position='first').index
                r2 = pval_df.loc[s, 'r2'].replace(np.nan, -1)
            else:
                s = window_df.index
                r2 = pd.Series(-1, index=s)
            ax.scatter(x[s], p[s], c=r2, s=20, cmap=cmap, norm=norm, edgecolor='k', lw=0.25, rasterized=rasterized)

        elif 'pip' in pval_df:
            p = window_df['pip']
            ylabels.append('PIP')
            minp = pval_df.loc[variant_id, 'pip']
            if 'cs_id' in pval_df:
                pip_df = pval_df[pval_df['cs_id'].notnull()].copy()
                cs_ix, cs_id = pd.factorize(pip_df['cs_id'])
                if len(cs_id) < 10:
                    cs_colors = sns.color_palette('Set1', desat=0.66).as_hex()
                else:
                    cs_colors = sns.color_palette('tab20b', desat=1).as_hex()
                cs_cmap = mpl.colors.ListedColormap(cs_colors)
                cs_norm = mpl.colors.BoundaryNorm(np.arange(1, cs_cmap.N+1), cs_cmap.N)
                ax.scatter(pip_df['position'], pip_df['pip'], c=pip_df['cs_id'].map(pd.Series(range(len(cs_id)), index=cs_id)),
                           s=22, ec='none', cmap=cs_cmap, norm=cs_norm, rasterized=rasterized, clip_on=False)
        else:
            raise NotImplementedError

        if credible_sets is not None:
            df = pval_df.loc[credible_sets[k]['variant_id']]
            ax.scatter(df['position'], -np.log10(df['pval_nominal']), c=credible_sets[k]['cs_id']/10, s=50)
            # credible_sets[k]['variant_id']

        if highlight_ids is not None:  # plot relative to lead variant
            if isinstance(highlight_ids, str):
                highlight_ids = [highlight_ids]
            highlight_df = pval_df.loc[highlight_ids].copy()
            highlight_df = highlight_df[~highlight_df.index.isin(variant_ids)]  # drop lead variant
            ix = highlight_df.index
            if 'pip' not in pval_df:
                ax.scatter(x[ix], p[ix], c=r2[ix], cmap=cmap, norm=norm, **highlight_args)
            else:
                if 'cs_id' in pval_df:  # only plot highlight IDs that are in CSs
                    ix = highlight_df.index[highlight_df.index.isin(pip_df.index)]
                    ax.scatter(x[ix], p[ix], c='goldenrod', **highlight_args)
                else:
                    ax.scatter(x[ix], p[ix], c='goldenrod', **highlight_args)

        # plot lead/selected variant, add text label, etc.
374 | if show_lead: 375 | if variant_id in pval_df.index: 376 | minpos = pval_df.loc[variant_id, 'position'] 377 | else: 378 | minpos = None 379 | 380 | if 'pip' not in pval_df: 381 | ax.scatter(minpos, minp, **select_args) 382 | elif minpos is not None: # highlight lead variant for each CS 383 | pip_df2 = pip_df.loc[pip_df.groupby('cs_id').apply(lambda x: x['pip'].idxmax())] 384 | ax.scatter(pip_df2['position'], pip_df2['pip'], c=pip_df2['cs_id'].map(pd.Series(range(len(cs_id)), index=cs_id)).values, 385 | cmap=cs_cmap, norm=cs_norm, s=24, marker='D', ec='k', lw=0.25) 386 | 387 | if k == 0 or not label_first_only: 388 | for i,r in pip_df2.iterrows(): 389 | i = i.split('_b')[0].replace('_',':',1).replace('_','-') 390 | if (r['position']-xlim[0])/(xlim[1]-xlim[0]) < 0.55: # right 391 | txt = ax.annotate(i, (r['position'], r['pip']), xytext=(5,5), textcoords='offset points') 392 | else: 393 | txt = ax.annotate(i, (r['position'], r['pip']), xytext=(-5,5), ha='right', textcoords='offset points') 394 | txt.set_bbox(dict(facecolor='w', alpha=0.5, edgecolor='none', boxstyle="round,pad=0.1")) 395 | 396 | if rs_id is not None: 397 | if isinstance(rs_id, str): 398 | t = rs_id 399 | else: 400 | t = rs_id[k] 401 | else: 402 | t = variant_id.split('_b')[0].replace('_',':',1).replace('_','-') 403 | 404 | if show_rsid and minpos is not None and 'pip' not in pval_df and (k == 0 or not label_first_only): # text label 405 | if (minpos-xlim[0])/(xlim[1]-xlim[0]) < 0.55: # right 406 | txt = ax.annotate(t, (minpos, minp), xytext=(5,ytext), textcoords='offset points') 407 | else: 408 | txt = ax.annotate(t, (minpos, minp), xytext=(-5,ytext), ha='right', textcoords='offset points') 409 | txt.set_bbox(dict(facecolor='w', alpha=0.5, edgecolor='none', boxstyle="round,pad=0.1")) 410 | 411 | if show_effect: 412 | arrow_width = 0.025 413 | arrow_height = 0.3 414 | arrow_dr = 0.125 415 | arrow_dt = 0.2 416 | beta_col = [i for i in pval_df.columns if i in ('slope', 'beta', 'effect_size')] 417 | # assert len(beta_col) == 1, f"No effect size found" 418 | if len(beta_col) == 1: 419 | beta_col = beta_col[0] 420 | beta = pval_df.loc[variant_id, beta_col] 421 | if beta > 0: 422 | ax.arrow(1-arrow_dr/aw, 1-(arrow_height+arrow_dt)/ah, 0, arrow_height/ah, 423 | head_length=0.1/ah, width=arrow_width/aw, 424 | ec='none', fc='tab:green', transform=ax.transAxes) 425 | ax.text(1-arrow_dr*1.66/aw, 1-(arrow_height/2+arrow_dt)/ah, r"$\beta$", va='center', ha='center', transform=ax.transAxes) 426 | else: 427 | ax.arrow(1-arrow_dr/aw, 1-(arrow_dt-0.1)/ah, 0, -arrow_height/ah, 428 | head_length=0.1/ah, width=arrow_width/aw, 429 | ec='none', fc='tab:red', transform=ax.transAxes) 430 | ax.text(1-arrow_dr*1.66/aw, 1-(arrow_dt-0.1+arrow_height/2)/ah, r"$\beta$", va='center', ha='center', transform=ax.transAxes) 431 | 432 | ax.margins(y=0.2) 433 | if ymax is not None and isinstance(ymax, Iterable): 434 | if ymax[k] is not None: 435 | ax.set_ylim([0, ymax[k]]) 436 | else: 437 | ax.set_ylim([0, ax.get_ylim()[1]]) 438 | else: 439 | if 'pip' in pval_df: 440 | ax.set_ylim([0, ax.get_ylim()[1]]) 441 | elif ymax is None: 442 | ax.set_ylim([0, np.maximum(ax.get_ylim()[1], miny)]) 443 | else: 444 | ax.set_ylim([0, ymax]) 445 | 446 | if shade_range is not None: # highlight subregion with gray background 447 | ax.add_patch(patches.Rectangle((shade_range[0], 0), np.diff(shade_range)[0], ax.get_ylim()[1], facecolor=shade_color, zorder=-10)) 448 | 449 | if labels is not None: 450 | if label_pos == 'left': 451 | for ax,t in zip(axes, labels): 452 | 
ax.text(0.02, 0.925, t, transform=ax.transAxes, va='top', ha='left', fontsize=label_fontsize) 453 | elif label_pos == 'right': 454 | for ax,t in zip(axes, labels): 455 | ax.text(0.98, 0.925, t, transform=ax.transAxes, va='top', ha='right', fontsize=label_fontsize) 456 | 457 | if single_ylabel: 458 | # for ax in axes: 459 | # x.set_ylabel(None) 460 | m = db + (n*ah + (n-1)*ds)/2 461 | fig.text(0.035, m/fh, '-log$\mathregular{_{10}}$(p-value)', va='center', rotation=90, fontsize=14); 462 | else: 463 | for k,ax in enumerate(axes): 464 | ax.set_ylabel(ylabels[k], fontsize=12)#, labelpad=15) 465 | # if 'p-value' in ylabels[k]: 466 | # ax.yaxis.set_label_coords(-0.07*4/aw, 0.5) 467 | 468 | for ax in axes: 469 | ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True, min_n_ticks=3, nbins=4)) 470 | axes[0].set_title(title, fontsize=12) 471 | 472 | if chr_label_pos == 'bottom': 473 | v = axes 474 | else: 475 | v = axes[1:] 476 | if tracks is not None: 477 | v += [tax] 478 | for ax in v: 479 | plt.setp(ax.get_xticklabels(), visible=False) 480 | for line in ax.xaxis.get_ticklines(): 481 | line.set_markersize(0) 482 | line.set_markeredgewidth(0) 483 | 484 | if sharey is not None: # force equal y limits 485 | shared_max = 0 486 | for k in sharey: 487 | y = axes[k-1].get_ylim()[1] 488 | if y > shared_max: 489 | shared_max = y 490 | for k in sharey: 491 | axes[k-1].set_ylim([0, shared_max]) 492 | 493 | if tracks is not None and len(tracks) > 0: # plot, e.g., ATAC-seq tracks 494 | ntracks = tracks.shape[1] 495 | x = tracks.index 496 | maxv = tracks.max().max() 497 | for k, label in enumerate(tracks): 498 | y0 = (ntracks-1-k) * np.ones(len(x)) # vertical offset 499 | if track_colors is not None and label in track_colors: 500 | color = track_colors[label] 501 | else: 502 | color = 'k' 503 | c = tracks[label] 504 | tax.fill_between(x, 0.95*c/maxv + y0, y0, 505 | antialiased=False, linewidth=1, facecolor=color, 506 | clip_on=True, rasterized=True) 507 | tax.set_yticks(np.arange(ntracks)) 508 | tax.set_yticklabels(tracks.columns[::-1], fontsize=9, va='bottom') 509 | for line in tax.yaxis.get_ticklines(): 510 | line.set_markersize(0) 511 | line.set_markeredgewidth(0) 512 | for i in ['top', 'bottom', 'right', 'left']: 513 | tax.spines[i].set_visible(False) 514 | tax.set_ylim([0, ntracks]) 515 | 516 | if gene[0] is None or gene[0].chr != chrom: 517 | axes[-1].xaxis.tick_bottom() 518 | axes[-1].xaxis.set_label_position('bottom') 519 | axes[-1].spines['bottom'].set_visible(True) 520 | axes[-1].tick_params(axis='x', pad=2) 521 | axes[-1].xaxis.labelpad = 8 522 | axes[-1].set_xlabel(f'Position on {chrom} (Mb)', fontsize=12) 523 | xt = axes[-1].get_xticks() 524 | axes[-1].set_xticks(xt) 525 | axes[-1].set_xticklabels(xt/1e6) 526 | axes[-1].set_xlim(xlim) 527 | else: # add gene model 528 | # plot gene model and annotate 529 | if gene[0].end_pos < xlim[0]: 530 | x = gh/aw/2 531 | v = np.array([[x,0.2], [x-0.8*gh/aw, 0.5], [x,0.8]]) 532 | polygon = patches.Polygon(v, closed=True, color='k', transform=gax.transAxes, clip_on=False) 533 | gax.add_patch(polygon) 534 | txt = f'{gene[0].name} (~{(pos-gene[0].tss)/1e6:.1f}Mb)' 535 | gax.set_ylim([-1,1]) 536 | gax.text(1.5*x, 0.5, txt, va='center', ha='left', transform=gax.transAxes) 537 | elif gene[0].start_pos > xlim[1]: 538 | x = 1 - gh/aw/2 539 | v = np.array([[x,0.2], [x+0.8*gh/aw, 0.5], [x,0.8]]) 540 | polygon = patches.Polygon(v, closed=True, color='k', transform=gax.transAxes, clip_on=False) 541 | gax.add_patch(polygon) 542 | txt = f'{gene[0].name} 
(~{(gene[0].tss-pos)/1e6:.1f}Mb)' 543 | gax.set_ylim([-1,1]) 544 | gax.text(1 - gh/aw/2*1.5, 0.5, txt, va='center', ha='right', transform=gax.transAxes) 545 | else: 546 | m = np.mean(xlim) 547 | for k,g in enumerate(gene): 548 | if g is not None: 549 | g = g.collapse() 550 | y = len(gene)-1-k # revert position 551 | g.plot(ax=gax, yoffset=y, max_intron=1e9, pc_color='k', nc_color='k', ec='none', wx=0.1, scale=gene_scale, ylabels=None, clip_on=True) 552 | if gene_label_pos is None: 553 | if g.tss - m > m - g.tss: 554 | gax.annotate(g.name, (np.minimum(g.end_pos, xlim[1]), y), xytext=(5,0), textcoords='offset points', va='center', ha='left') 555 | else: 556 | gax.annotate(g.name, (np.maximum(g.start_pos, xlim[0]), y), xytext=(-5,0), textcoords='offset points', va='center', ha='right') 557 | elif gene_label_pos == 'left': 558 | gax.annotate(g.name, (np.maximum(g.start_pos, xlim[0]), y), xytext=(-5,0), textcoords='offset points', va='center', ha='right') 559 | elif gene_label_pos == 'right': 560 | gax.annotate(g.name, (np.minimum(g.end_pos, xlim[1]), y), xytext=(5,0), textcoords='offset points', va='center', ha='left') 561 | gax.set_ylim([-0.5, len(gene)-0.5]) 562 | 563 | if chr_label_pos == 'bottom': 564 | gax.set_xlabel(f'Position on {chrom} (Mb)', fontsize=12) 565 | else: 566 | plt.setp(gax.get_xticklabels(), visible=False) 567 | for line in gax.xaxis.get_ticklines(): 568 | line.set_markersize(0) # tick length 569 | line.set_markeredgewidth(0) # tick line width 570 | gax.spines['bottom'].set_visible(False) 571 | 572 | gax.set_yticks([]) 573 | gax.set_yticklabels([]) 574 | gax.spines['top'].set_visible(False) 575 | gax.spines['left'].set_visible(False) 576 | gax.spines['right'].set_visible(False) 577 | gax.set_title('') 578 | xt = gax.get_xticks() 579 | gax.set_xticks(xt) 580 | gax.set_xticklabels(xt/1e6) 581 | gax.set_xlim(xlim) 582 | axes.append(gax) 583 | 584 | if chr_label_pos != 'bottom': 585 | axes[0].xaxis.tick_top() 586 | axes[0].xaxis.set_label_position('top') 587 | axes[0].set_xlabel(f'Position on {chrom} (Mb)', fontsize=12) 588 | axes[0].spines['top'].set_visible(True) 589 | axes[0].tick_params(axis='x', pad=2) 590 | axes[0].xaxis.labelpad = 8 591 | 592 | for ax in axes: 593 | ax.set_facecolor('none') 594 | 595 | # for i in range(len(pvals)): 596 | # axes[i].get_yaxis().set_label_coords(-0.12,0.5) 597 | 598 | return axes 599 | 600 | 601 | def plot_ieqtl_locus(eqtl_df, ieqtl_df, gwas_df, r2_s, gene_id, variant_id, annot, 602 | independent_df=None, rs_id=None, trait_name=None, pp4=None, window=200000, 603 | aw=4, ah=1.25): 604 | 605 | pvals = [ 606 | gwas_df.rename(columns={'pvalue':'pval_nominal'}), 607 | eqtl_df.loc[eqtl_df.index.isin(r2_s.index)], 608 | ieqtl_df.loc[ieqtl_df.index.isin(r2_s.index)] 609 | ] 610 | 611 | if trait_name is None: 612 | trait_name = 'GWAS' 613 | 614 | labels = [trait_name] 615 | if pp4 is None: 616 | labels.extend(['eQTL', 'ieQTL']) 617 | else: 618 | labels.extend([f'eQTL (PP4 = {pp4[0]:.2f})', f'ieQTL (PP4 = {pp4[1]:.2f})']) 619 | 620 | plot_locus(pvals, variant_ids=variant_id, r2_s=r2_s, gene=annot.gene_dict[gene_id], rs_id=rs_id, 621 | highlight_ids=None, aw=aw, ah=ah, 622 | labels=labels, shade_range=None, gene_label_pos='right', chr_label_pos='bottom', window=window) 623 | 624 | 625 | 626 | if __name__ == '__main__': 627 | mpl.use('Agg') 628 | 629 | parser = argparse.ArgumentParser(description='locus plot') 630 | parser.add_argument('--eqtl', required=True, help='QTL summary statistics file containing all pairwise associations') 631 | 
parser.add_argument('--ieqtl', required=True, help='ieQTL summary statistics file containing all pairwise associations')
632 |     parser.add_argument('--gwas', required=True, help='GWAS summary statistics file')
633 |     parser.add_argument('--vcf', required=True, help='VCF file')
634 |     parser.add_argument('--phenotype_bed', required=True, help='Phenotype BED file used for QTL mapping (required for parsing sample IDs)')
635 |     parser.add_argument('--gene_id', required=True, help='Gene ID')
636 |     parser.add_argument('--gtf', required=True, help='Gene annotation in GTF format')
637 |     parser.add_argument('--variant_id', help='Variant ID')
638 |     parser.add_argument('--phenotype_id', default=None, help='Select p-values for a specific phenotype, e.g., for sQTLs')
639 |     parser.add_argument('--rs_id', help='rs ID of the lead variant')
640 |     parser.add_argument('--id_lookup_table', help='Lookup table mapping variant IDs to rs IDs (rs ID must be in last column)')
641 |     parser.add_argument('--window', default=200000, type=int, help='Plotting window around the lead variant, in bp')
642 |     parser.add_argument('--labels', nargs='+', default=None, help='Panel labels')
643 |     parser.add_argument('--ymax', nargs='+', type=np.float64, default=None)
644 |     parser.add_argument('--sharey', nargs='+', type=int, help='Use same y-axis for the specified plots (1-indexed, with top plot starting at 1.)', default=None)
645 |     parser.add_argument('--top_variant', default='ieQTL', choices=['GWAS', 'eQTL', 'ieQTL'])
646 |     parser.add_argument('--output_dir', default='.', type=str, help='Output directory')
647 |     args = parser.parse_args()
648 | 
649 |     print('Loading gene annotation')
650 |     annot = annotation.Annotation(args.gtf)
651 |     gene = annot.gene_dict[args.gene_id]
652 |     chrom = gene.chr
653 | 
654 |     if args.phenotype_id is not None:
655 |         load_id = args.phenotype_id
656 |     else:
657 |         load_id = args.gene_id
658 | 
659 |     print('Loading eQTL summary statistics')
660 |     eqtl_df = load_eqtl(args.eqtl, load_id, chrom)
661 | 
662 |     print('Loading ieQTL summary statistics')
663 |     ieqtl_df = load_eqtl(args.ieqtl, load_id, chrom)
664 |     if not np.all(ieqtl_df.index.isin(eqtl_df.index)):
665 |         print('WARNING: ieQTL results contain variants not present in eQTL results')
666 | 
667 |     print('Loading GWAS summary statistics')
668 |     gwas_df = load_gwas(args.gwas, eqtl_df.index)
669 | 
670 |     if args.variant_id is None:
671 |         common_ix = ieqtl_df.index[ieqtl_df.index.isin(eqtl_df.index) & ieqtl_df.index.isin(gwas_df.index)]
672 |         if args.top_variant == 'ieQTL':
673 |             variant_id = ieqtl_df.loc[common_ix, 'pval_nominal'].idxmin()
674 |         elif args.top_variant == 'eQTL':
675 |             variant_id = eqtl_df.loc[common_ix, 'pval_nominal'].idxmin()
676 |         else:
677 |             variant_id = gwas_df.loc[common_ix, 'pval_nominal'].idxmin()
678 |     else:
679 |         variant_id = args.variant_id
680 |     chrom, pos, ref, alt, _ = variant_id.split('_')
681 |     pos = int(pos)
682 | 
683 |     print('Loading genotypes and computing LD')
684 |     r2_s = get_ld(args.vcf, variant_id, args.phenotype_bed)
685 | 
686 |     rs_id = args.rs_id
687 |     if rs_id is None and args.id_lookup_table is not None:
688 |         print('Parsing rsID lookup table')
689 |         rs_id = get_rsid(args.id_lookup_table, variant_id)
690 | 
691 |     print('Generating plot')
692 |     plot_locus([gwas_df, eqtl_df, ieqtl_df], variant_ids=variant_id, gene=gene, r2_s=r2_s,
693 |                rs_id=rs_id, labels=[i.encode('utf-8').decode('unicode_escape') for i in args.labels] if args.labels is not None else None,
694 |                ymax=args.ymax, sharey=args.sharey,
695 |                window=args.window, shared_only=True)
696 | 
697 |     pdf_file = os.path.join(args.output_dir, f'{gene.name}.{variant_id}.locus_plot.pdf')
698 |     plt.savefig(pdf_file)
699 | 
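    # Example invocation (a sketch; file names are hypothetical and must match the
    # formats expected by load_eqtl/load_gwas/get_ld defined above):
    # python3 qtl/locusplot.py --eqtl eqtl.allpairs.txt.gz --ieqtl ieqtl.allpairs.txt.gz \
    #     --gwas gwas.txt.gz --vcf genotypes.vcf.gz --phenotype_bed phenotypes.bed.gz \
    #     --gene_id ENSG00000000001.1 --gtf gencode.annotation.gtf --output_dir .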
700 |     print('Done.')
701 | 
--------------------------------------------------------------------------------
/qtl/map.py:
--------------------------------------------------------------------------------
1 | """qtl.map: functions for mapping QTLs"""
2 | 
3 | __author__ = "Francois Aguet"
4 | __copyright__ = "Copyright 2018-2020, The Broad Institute"
5 | __license__ = "BSD3"
6 | 
7 | import numpy as np
8 | import pandas as pd
9 | import scipy.stats
10 | import re
11 | from . import stats
12 | from . import genotype as gt
13 | from . import locusplot
14 | try:
15 |     import rpy2.robjects  # 'import rpy2' alone does not expose rpy2.robjects
16 |     has_rpy2 = True
17 | except ImportError:
18 |     has_rpy2 = False
19 | 
20 | 
21 | def calculate_association(genotype, phenotype_s, covariates_df=None, impute=True, logp=False):
22 |     """Compute genotype-phenotype associations"""
23 |     if logp and not has_rpy2:
24 |         raise ValueError("The rpy2 package is required to compute log p-values.")
25 |     if isinstance(genotype, pd.Series):
26 |         genotype_df = genotype.to_frame().T
27 |     elif isinstance(genotype, pd.DataFrame):
28 |         genotype_df = genotype
29 |     else:
30 |         raise ValueError('Input type not supported')
31 | 
32 |     # assert np.all(genotype_df.columns==phenotype_s.index)
33 |     if covariates_df is not None:
34 |         assert covariates_df.index.equals(genotype_df.columns)
35 | 
36 |     # impute missing genotypes
37 |     if impute:
38 |         gt.impute_mean(genotype_df, verbose=False)
39 | 
40 |     # residualize genotypes and phenotype
41 |     if covariates_df is not None:
42 |         r = stats.Residualizer(covariates_df)
43 |         gt_res_df = r.transform(genotype_df)
44 |         p_res_s = r.transform(phenotype_s)
45 |         num_covar = covariates_df.shape[1]
46 |     else:
47 |         gt_res_df = genotype_df
48 |         p_res_s = phenotype_s
49 |         num_covar = 0
50 | 
51 |     if isinstance(p_res_s, pd.Series):
52 |         n = p_res_s.std() / gt_res_df.std(axis=1)
53 |     else:
54 |         n = p_res_s.std(axis=1) / gt_res_df.std(axis=1).values
55 | 
56 |     gt_res_df = stats.center_normalize(gt_res_df, axis=1)
57 |     if isinstance(p_res_s, pd.Series):
58 |         p_res_s = stats.center_normalize(p_res_s)
59 |     else:
60 |         p_res_s = stats.center_normalize(p_res_s, axis=1)
61 | 
62 |     if isinstance(p_res_s, pd.Series):
63 |         r = gt_res_df.dot(p_res_s)
64 |     else:  # single genotype x phenotypes
65 |         r = gt_res_df.dot(p_res_s.T).squeeze()
66 |     dof = gt_res_df.shape[1] - 2 - num_covar
67 |     tstat = r * np.sqrt(dof/(1-r*r))
68 | 
69 |     if not logp:
70 |         pval = 2*scipy.stats.t.cdf(-np.abs(tstat), dof)
71 |     else:
72 |         r_pt = rpy2.robjects.r['pt']
73 |         rt = rpy2.robjects.vectors.FloatVector(-np.abs(tstat))
74 |         pval = -(np.array(r_pt(rt, dof, lower_tail=True, log=True)) + np.log(2)) * np.log10(np.e)
75 | 
76 |     df = pd.DataFrame(pval, index=tstat.index, columns=['pval_nominal'])
77 |     df['slope'] = r * n
78 |     df['slope_se'] = df['slope'] / tstat
79 |     df['corr_r2'] = r*r
80 |     df['tstat'] = tstat
81 |     n2 = 2 * genotype_df.shape[1]
82 |     af = genotype_df.sum(1) / n2
83 |     if isinstance(p_res_s, pd.Series):
84 |         df['af'] = af
85 |     else:
86 |         assert len(af) == 1
87 |         df['af'] = af.values[0]
88 |     ix = df['af'] <= 0.5
89 |     m = genotype_df > 0.5
90 |     a = m.sum(1).astype(int)
91 |     b = (genotype_df < 1.5).sum(1).astype(int)
92 |     df['ma_samples'] = np.where(ix, a, b)
93 |     a = (genotype_df * m).sum(1).round().astype(int)  # round for missing/imputed genotypes
94 |     df['ma_count'] = np.where(ix, a, n2-a)
95 |     if isinstance(genotype, pd.DataFrame):
96 |         if logp:
97 |             df['r2'] = locusplot.compute_ld(genotype, df['pval_nominal'].idxmax())
98 |         else:
99 |             df['r2'] = locusplot.compute_ld(genotype, df['pval_nominal'].idxmin())
100 | 
101 |     # if isinstance(df.index[0], str) and '_' in df.index[0]:  # assume variant IDs in format chr_pos_ref_alt_build
102 |     if isinstance(df.index[0], str) and len(re.findall(r"^(?:chr)?\w_?\d+_", df.index[0])) == 1:
103 |         df['chr'] = df.index.map(lambda x: x.split('_')[0])
104 |         df['position'] = df.index.map(lambda x: int(x.split('_')[1]))
105 |     if isinstance(p_res_s, pd.Series):
106 |         df.index.name = 'variant_id'
107 |     else:
108 |         df.index.name = 'phenotype_id'
109 |     m = df['pval_nominal'] == 0
110 |     if any(m):
111 |         e = np.nextafter(0, 1)  # np.finfo(np.float64).tiny
112 |         print(f"Warning: underflow detected (setting to {e}), use logp=True to compute p-values as -log10(P).")
113 |         df.loc[m, 'pval_nominal'] = e
114 |     return df
115 | 
116 | 
117 | def map_pairs(genotype_df, phenotype_df, covariates_df=None, impute=True):
118 |     """Calculates association statistics for arbitrary phenotype-variant pairs"""
119 |     assert genotype_df.shape[0] == phenotype_df.shape[0]
120 |     assert genotype_df.columns.equals(phenotype_df.columns)
121 |     assert covariates_df is None or genotype_df.columns.equals(covariates_df.index)
122 |     if impute:
123 |         gt.impute_mean(genotype_df, verbose=False)
124 | 
125 |     # residualize genotypes and phenotype
126 |     if covariates_df is not None:
127 |         r = stats.Residualizer(covariates_df)
128 |         gt_res_df = r.transform(genotype_df)
129 |         p_res_df = r.transform(phenotype_df)
130 |         num_covar = covariates_df.shape[1]
131 |     else:
132 |         gt_res_df = genotype_df
133 |         p_res_df = phenotype_df
134 |         num_covar = 0
135 | 
136 |     n = p_res_df.std(axis=1).values / gt_res_df.std(axis=1).values
137 | 
138 |     gt_res_df = stats.center_normalize(gt_res_df, axis=1)
139 |     p_res_df = stats.center_normalize(p_res_df, axis=1)
140 | 
141 |     r = np.sum(gt_res_df.values * p_res_df.values, axis=1)
142 |     dof = gt_res_df.shape[1] - 2 - num_covar
143 | 
144 |     tstat2 = dof*r*r / (1-r*r)
145 |     pval = scipy.stats.f.sf(tstat2, 1, dof)
146 | 
147 |     df = pd.DataFrame({'phenotype_id':phenotype_df.index, 'variant_id':genotype_df.index, 'pval_nominal':pval})
148 |     df['slope'] = r * n
149 |     df['slope_se'] = df['slope'].abs() / np.sqrt(tstat2)
150 |     df['af'] = genotype_df.sum(1).values / (2*genotype_df.shape[1])
151 |     df['maf'] = np.where(df['af'] <= 0.5, df['af'], 1-df['af'])
152 |     return df
153 | 
154 | 
155 | def calculate_interaction(genotype_s, phenotype_s, interaction_s, covariates_df=None, impute=True):
156 | 
157 |     assert genotype_s.index.equals(interaction_s.index)
158 | 
159 |     # impute missing genotypes
160 |     if impute:
161 |         gt.impute_mean(genotype_s, verbose=False)
162 | 
163 |     # interaction term
164 |     gi = genotype_s * interaction_s
165 | 
166 |     # center
167 |     g0 = genotype_s - genotype_s.mean()
168 |     gi0 = gi - gi.mean()
169 |     i0 = interaction_s - interaction_s.mean()
170 |     p0 = phenotype_s - phenotype_s.mean()
171 | 
172 |     dof = phenotype_s.shape[0] - 4
173 |     # residualize
174 |     if covariates_df is not None:
175 |         r = stats.Residualizer(covariates_df)
176 |         g0 = r.transform(g0.values.reshape(1,-1), center=False)
177 |         gi0 = r.transform(gi0.values.reshape(1,-1), center=False)
178 |         p0 = r.transform(p0.values.reshape(1,-1), center=False)
179 |         i0 = r.transform(i0.values.reshape(1,-1), center=False)
180 |         dof -= covariates_df.shape[1]
181 |     else:
182 |         g0 = g0.values.reshape(1,-1)
183 |         gi0 = gi0.values.reshape(1,-1)
184 |         p0 = p0.values.reshape(1,-1)
185 |         i0 = i0.values.reshape(1,-1)
186 | 
187 |     # regression
188 |     X = np.r_[g0, i0, gi0].T
189 |     Xinv = np.linalg.inv(np.dot(X.T, X))
190 |     b = np.dot(np.dot(Xinv, X.T), p0.reshape(-1,1))
191 |     r = 
np.squeeze(np.dot(X, b)) - p0 192 | rss = np.sum(r*r) 193 | b_se = np.sqrt(np.diag(Xinv) * rss / dof) 194 | b = np.squeeze(b) 195 | tstat = b / b_se 196 | pval = 2*scipy.stats.t.cdf(-np.abs(tstat), dof) 197 | 198 | return pd.Series({ 199 | 'b_g':b[0], 'b_g_se':b_se[0], 'pval_g':pval[0], 200 | 'b_i':b[1], 'b_i_se':b_se[1], 'pval_i':pval[1], 201 | 'b_gi':b[2],'b_gi_se':b_se[2],'pval_gi':pval[2], 202 | })#, r[0] 203 | 204 | 205 | def compute_ld(genotype_df, variant_id): 206 | """Compute LD (r2)""" 207 | # return gt_df.corrwith(gt_df.loc[variant_id], axis=1, method='pearson')**2 208 | g0 = genotype_df - genotype_df.values.mean(1, keepdims=True) 209 | d = (g0**2).sum(1) * (g0.loc[variant_id]**2).sum() 210 | return (g0 * g0.loc[variant_id]).sum(1)**2 / d 211 | 212 | 213 | def get_conditional_pvalues(group_df, genotypes, phenotype_df, covariates_df, 214 | phenotype_id=None, window=200000, maf_threshold=0): 215 | """ 216 | Compute conditional p-values for a set of variants defined in group_df 217 | 218 | group_df : dataframe with columns 'variant_id' and 'phenotype_id' 219 | genotypes : pd.DataFrame or qtl.genotype.GenotypeIndexer 220 | phenotype_df : pd.DataFrame 221 | covariates_df : pd.DataFrame 222 | """ 223 | assert phenotype_df.columns.equals(covariates_df.index) 224 | variant_id = group_df['variant_id'].iloc[0] 225 | 226 | if isinstance(genotypes, gt.GenotypeIndexer): 227 | gt_df = genotypes.get_genotype_window(variant_id, window=window) 228 | elif isinstance(genotypes, pd.DataFrame): 229 | gt_df = genotypes 230 | else: 231 | raise ValueError('Unsupported input format') 232 | 233 | maf = gt_df.sum(1) / (2*gt_df.shape[1]) 234 | maf = np.where(maf<=0.5, maf, 1-maf) 235 | 236 | gt_df = gt_df[maf >= maf_threshold] 237 | 238 | res = [] 239 | if phenotype_id is not None: 240 | pval_df = calculate_association(gt_df, phenotype_df.loc[phenotype_id], covariates_df=covariates_df) 241 | pval_df['r2'] = compute_ld(gt_df, variant_id) 242 | res.append(pval_df) 243 | 244 | for k,(variant_id, phenotype_id) in enumerate(zip(group_df['variant_id'], group_df['phenotype_id']), 1): 245 | print(f'\rProcessing {k}/{group_df.shape[0]}', end='') 246 | covariates = pd.concat([covariates_df, gt_df.loc[np.setdiff1d(group_df['variant_id'], variant_id)].T], axis=1) 247 | pval_df = calculate_association(gt_df, phenotype_df.loc[phenotype_id], covariates_df=covariates) 248 | pval_df['r2'] = compute_ld(gt_df, variant_id) 249 | 250 | res.append(pval_df) 251 | return res 252 | -------------------------------------------------------------------------------- /qtl/norm.py: -------------------------------------------------------------------------------- 1 | # Author: Francois Aguet 2 | import numpy as np 3 | import pandas as pd 4 | import scipy.stats as stats 5 | import warnings 6 | 7 | 8 | #-------------------------------------- 9 | # eQTL expression normalization 10 | #-------------------------------------- 11 | def normalize_quantiles(df): 12 | """ 13 | Quantile normalization to the average empirical distribution 14 | Note: replicates behavior of R function normalize.quantiles 15 | from library("preprocessCore") 16 | 17 | Reference: 18 | [1] Bolstad et al., Bioinformatics 19(2), pp. 
185-193, 2003 19 | 20 | Adapted from https://github.com/andrewdyates/quantile_normalize 21 | """ 22 | M = df.values.copy() 23 | 24 | Q = M.argsort(axis=0) 25 | m,n = M.shape 26 | 27 | # compute quantile vector 28 | quantiles = np.zeros(m) 29 | for i in range(n): 30 | quantiles += M[Q[:,i],i] 31 | quantiles = quantiles / n 32 | 33 | for i in range(n): 34 | # Get equivalence classes; unique values == 0 35 | dupes = np.zeros(m, dtype=np.int64) 36 | for j in range(m-1): 37 | if M[Q[j,i],i] == M[Q[j+1,i],i]: 38 | dupes[j+1] = dupes[j]+1 39 | 40 | # Replace column with quantile ranks 41 | M[Q[:,i],i] = quantiles 42 | 43 | # Average together equivalence classes 44 | j = m-1 45 | while j >= 0: 46 | if dupes[j] == 0: 47 | j -= 1 48 | else: 49 | idxs = Q[j-dupes[j]:j+1,i] 50 | M[idxs,i] = np.median(M[idxs,i]) 51 | j -= 1 + dupes[j] 52 | assert j == -1 53 | 54 | return pd.DataFrame(M, index=df.index, columns=df.columns) 55 | 56 | 57 | def inverse_normal_transform(M): 58 | """Transform rows to a standard normal distribution""" 59 | if isinstance(M, pd.Series): 60 | r = stats.rankdata(M) 61 | return pd.Series(stats.norm.ppf(r/(M.shape[0]+1)), index=M.index, name=M.name) 62 | else: 63 | R = stats.rankdata(M, axis=1) # ties are averaged 64 | Q = stats.norm.ppf(R/(M.shape[1]+1)) 65 | if isinstance(M, pd.DataFrame): 66 | Q = pd.DataFrame(Q, index=M.index, columns=M.columns) 67 | return Q 68 | 69 | #-------------------------------------- 70 | # DESeq size factor normalization 71 | #-------------------------------------- 72 | def deseq2_size_factors(counts_df): 73 | """ 74 | Calculate DESeq size factors 75 | median of ratio to reference sample (geometric mean of all samples) 76 | 77 | References: 78 | [1] Anders & Huber, 2010 79 | [2] R functions: 80 | DESeq::estimateSizeFactorsForMatrix 81 | """ 82 | idx = np.all(counts_df>0, axis=1) 83 | tmp_df = np.log(counts_df.loc[idx.values]) 84 | s = np.exp(np.median(tmp_df.T - np.mean(tmp_df, axis=1), axis=1)) 85 | return s 86 | 87 | 88 | def deseq2_normalized_counts(counts_df): 89 | """ 90 | Equivalent to DESeq2:::counts.DESeqDataSet; counts(x, normalized=T) 91 | """ 92 | return counts_df / deseq2_size_factors(counts_df) 93 | 94 | 95 | def deseq2_cpm(counts_df): 96 | """Calculate CPM normalized by DESeq size factors""" 97 | cpm_df = counts_df/counts_df.sum(axis=0)*1e6 98 | s = deseq2_size_factors(cpm_df) 99 | return cpm_df / s 100 | 101 | #-------------------------------------- 102 | # edgeR TMM normalization 103 | #-------------------------------------- 104 | def edger_calcnormfactors(counts_df, ref=None, logratio_trim=0.3, 105 | sum_trim=0.05, acutoff=-1e10, verbose=False): 106 | """ 107 | Calculate TMM (Trimmed Mean of M values) normalization. 108 | Reproduces edgeR::calcNormFactors.default 109 | 110 | Scaling factors for the library sizes that minimize 111 | the log-fold changes between the samples for most genes. 
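    Example (a sketch; `counts_df` is a genes x samples DataFrame of raw counts):
        tmm = edger_calcnormfactors(counts_df)
        cpm_df = edger_cpm(counts_df, tmm=tmm)  # TMM-normalized CPM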
112 | 
113 |     Effective library size: TMM scaling factor * library size
114 | 
115 |     References:
116 |      [1] Robinson & Oshlack, 2010
117 |      [2] R functions:
118 |          edgeR::calcNormFactors.default
119 |          edgeR:::.calcFactorWeighted
120 |          edgeR:::.calcFactorQuantile
121 |     """
122 | 
123 |     # discard genes with all-zero counts
124 |     Y = counts_df.values.copy()
125 |     allzero = np.sum(Y>0, axis=1) == 0
126 |     if np.any(allzero):
127 |         Y = Y[~allzero,:]
128 | 
129 |     # select reference sample
130 |     if ref is None:  # reference sample index
131 |         f75 = np.percentile(Y/np.sum(Y, axis=0), 75, axis=0)
132 |         ref = np.argmin(np.abs(f75-np.mean(f75)))
133 |         if verbose:
134 |             print('Reference sample index: '+str(ref))
135 | 
136 |     N = np.sum(Y, axis=0)  # total reads in each library
137 | 
138 |     # with np.errstate(divide='ignore'):
139 |     with warnings.catch_warnings():
140 |         warnings.simplefilter('ignore')
141 |         # log fold change; Mg in [1]
142 |         logR = np.log2((Y/N).T / (Y[:,ref]/N[ref])).T
143 |         # average log relative expression; Ag in [1]
144 |         absE = 0.5*(np.log2(Y/N).T + np.log2(Y[:,ref]/N[ref])).T
145 |         v = (N-Y)/N/Y
146 |         v = (v.T + v[:,ref]).T  # w in [1]
147 | 
148 |     ns = Y.shape[1]
149 |     tmm = np.zeros(ns)
150 |     for i in range(ns):
151 |         fin = np.isfinite(logR[:,i]) & np.isfinite(absE[:,i]) & (absE[:,i] > acutoff)
152 |         n = np.sum(fin)
153 | 
154 |         loL = np.floor(n*logratio_trim)+1
155 |         hiL = n + 1 - loL
156 |         loS = np.floor(n*sum_trim)+1
157 |         hiS = n + 1 - loS
158 |         rankR = stats.rankdata(logR[fin,i])
159 |         rankE = stats.rankdata(absE[fin,i])
160 |         keep = (rankR >= loL) & (rankR <= hiL) & (rankE >= loS) & (rankE <= hiS)
161 |         # in [1], w erroneously defined as 1/v ?
162 |         tmm[i] = 2**(np.nansum(logR[fin,i][keep]/v[fin,i][keep]) / np.nansum(1/v[fin,i][keep]))
163 | 
164 |     tmm = tmm / np.exp(np.mean(np.log(tmm)))
165 |     return tmm
166 | 
167 | 
168 | def edger_cpm_default(counts_df, lib_size=None, log=False, prior_count=0.25):
169 |     """
170 |     edgeR normalized counts
171 | 
172 |     Reproduces edgeR::cpm.default
173 |     """
174 |     if lib_size is None:
175 |         lib_size = counts_df.sum(axis=0)
176 |     if log:
177 |         prior_count_scaled = lib_size/np.mean(lib_size) * prior_count
178 |         lib_size = lib_size + 2 * prior_count_scaled
179 |     lib_size = 1e-6 * lib_size
180 |     if log:
181 |         return np.log2((counts_df + prior_count_scaled)/lib_size)
182 |     else:
183 |         return counts_df / lib_size
184 | 
185 | 
186 | def edger_cpm(counts_df, tmm=None, normalized_lib_sizes=True):
187 |     """
188 |     Return edgeR normalized/rescaled CPM (counts per million)
189 | 
190 |     Reproduces edgeR::cpm.DGEList
191 |     """
192 |     lib_size = counts_df.sum(axis=0)
193 |     if normalized_lib_sizes:
194 |         if tmm is None:
195 |             tmm = edger_calcnormfactors(counts_df)
196 |         lib_size = lib_size * tmm
197 |     return counts_df / lib_size * 1e6
198 | 
199 | #--------------------------------------
200 | # limma-voom functions
201 | #--------------------------------------
202 | def voom_transform(counts_df):
203 |     """Apply counts transformation from limma-voom"""
204 |     lib_size = counts_df.sum(0)
205 |     norm_factors = edger_calcnormfactors(counts_df)
206 |     return np.log2((counts_df + 0.5) / (lib_size*norm_factors + 1) * 1e6)
207 | 
208 | #--------------------------------------
209 | # PoissonSeq size factor normalization
210 | #--------------------------------------
211 | def poissonseq_size_factors(counts_df, maxiter=10):
212 |     """
213 |     PoissonSeq normalization from Li et al., Biostatistics, 2012
214 |     """
215 |     gsum = counts_df.sum(1)
216 | 
217 |     # initialize
218 |     ix = counts_df.index
219 |     libsize = counts_df.sum(0)
220 |     d_est = libsize / libsize.sum()
221 | 
222 |     # v = [d_est]
223 |     i = 0
224 |     meandiff = 1
225 |     while i < maxiter and meandiff > 1e-10:
226 |         d = np.outer(gsum, d_est)
227 |         gof = ((counts_df - d).pow(2) / d).sum(1)
228 |         lb, ub = np.percentile(gof, [25,75])
229 |         ix = gof[(lb<=gof) & (gof<=ub)].index
230 |         d_est0 = d_est
231 |         d_est = counts_df.loc[ix].sum(0) / gsum.loc[ix].sum()
232 |         meandiff = (d_est - d_est0).pow(2).sum() / counts_df.shape[1]
233 |         i += 1
234 |         # print(meandiff)
235 |         # v.append(d_est)
236 |     return d_est
237 | 
--------------------------------------------------------------------------------
/qtl/pca.py:
--------------------------------------------------------------------------------
1 | """qtl.pca: helper functions for PCA of expression data"""
2 | 
3 | __author__ = "Francois Aguet"
4 | __copyright__ = "Copyright 2018-2020, The Broad Institute"
5 | __license__ = "BSD3"
6 | 
7 | import numpy as np
8 | import pandas as pd
9 | import sklearn.decomposition
10 | 
11 | from . import norm
12 | from . import stats
13 | 
14 | 
15 | def normalize_counts(gct_df, C=None, threshold=10, threshold_frac=0.1):
16 |     """
17 |     Normalize (size factors), threshold, residualize, center, unit norm
18 | 
19 |     gct_df: read counts or TPMs
20 |     C: covariates matrix
21 |     """
22 | 
23 |     gct_norm_df = gct_df.copy() / norm.deseq2_size_factors(gct_df)
24 |     for x in gct_norm_df.values:
25 |         m = x == 0
26 |         if not all(m):
27 |             x[m] = np.min(x[~m])/2
28 | 
29 |     # threshold low expressed genes: >=10 counts in >10% of samples (default)
30 |     mask = np.mean(gct_norm_df >= threshold, axis=1) > threshold_frac
31 |     gct_norm_df = np.log10(gct_norm_df[mask])
32 | 
33 |     if C is not None:
34 |         gct_norm_df = stats.residualize(gct_norm_df, C, center=False)
35 | 
36 |     gct_norm_std_df = stats.center_normalize(gct_norm_df)
37 |     return gct_norm_std_df
38 | 
39 | 
40 | def get_pcs(gct_df, normalize=True, C=None, n_components=5, return_loadings=False, random_state=None):
41 |     """
42 |     Scale input GCT, threshold, normalize and calculate PCs
43 |     """
44 |     if normalize:
45 |         gct_norm_std_df = normalize_counts(gct_df, C=C)
46 |     else:
47 |         gct_norm_std_df = gct_df
48 | 
49 |     pca = sklearn.decomposition.PCA(n_components=n_components, svd_solver='auto', random_state=random_state)
50 |     pca.fit(gct_norm_std_df.T)
51 |     P = pca.transform(gct_norm_std_df.T)
52 |     pc_df = pd.DataFrame(P, index=gct_norm_std_df.columns,
53 |                          columns=[f'PC{i}' for i in range(1, P.shape[1]+1)])
54 |     pve_s = pd.Series(pca.explained_variance_ratio_ * 100, index=pc_df.columns, name='pve')
55 |     if not return_loadings:
56 |         return pc_df, pve_s
57 |     else:
58 |         loadings_df = pd.DataFrame(pca.components_.T, index=gct_norm_std_df.index, columns=pc_df.columns)
59 |         return pc_df, pve_s, loadings_df
--------------------------------------------------------------------------------
/qtl/pileup.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import glob
4 | import os
5 | import subprocess
6 | import contextlib
7 | import tempfile
8 | from collections.abc import Iterable
9 | import multiprocessing as mp
10 | import matplotlib.pyplot as plt
11 | import matplotlib.patches as patches
12 | from matplotlib.colors import hsv_to_rgb, rgb2hex
13 | import seaborn as sns
14 | from cycler import cycler
15 | import pyBigWig
16 | 
17 | from . import stats, annotation
18 | from . import plot as qtl_plot
19 | from . import genotype as gt
20 | from . 
import core 21 | 22 | 23 | @contextlib.contextmanager 24 | def cd(cd_path): 25 | if cd_path is not None: 26 | saved_path = os.getcwd() 27 | os.chdir(cd_path) 28 | yield 29 | os.chdir(saved_path) 30 | else: 31 | yield 32 | 33 | 34 | def _samtools_depth_wrapper(args): 35 | """ 36 | Wrapper for `samtools depth`. 37 | 38 | For files on GCP, GCS_OAUTH_TOKEN must be set. 39 | This can be done with qtl.refresh_gcs_token(). 40 | """ 41 | bam_file, region_str, sample_id, bam_index_dir, flags, user_project = args 42 | 43 | cmd = f"samtools depth {flags} -r {region_str} {bam_file}" 44 | if user_project is not None: 45 | cmd += f"?userProject={user_project}" 46 | with cd(bam_index_dir): 47 | c = subprocess.check_output(cmd, shell=True).decode().strip().split('\n') 48 | 49 | df = pd.DataFrame([i.split('\t') for i in c], columns=['chr', 'pos', sample_id]) 50 | df.index = df['pos'].astype(np.int32) 51 | return df[sample_id].astype(np.int32) 52 | 53 | 54 | def samtools_depth(region_str, bam_s, bam_index_dir=None, flags='-aa -Q 255 -d 100000', 55 | num_threads=12, user_project=None, verbose=True): 56 | """ 57 | Run samtools depth for a list of BAMs. 58 | 59 | Note: reads with the flags [UNMAP,SECONDARY,QCFAIL,DUP] are excluded by default; 60 | see documentation for `samtools depth` and http://www.htslib.org/doc/samtools-flags.html 61 | 62 | Parameters 63 | ---------- 64 | region_str : str 65 | Genomic region as 'chr:start-end' 66 | bam_s : pd.Series or dict 67 | sample_id -> bam_path 68 | bam_index_dir: str 69 | Directory already containing local copies of the BAM/CRAM indexes, or target directory 70 | flags : str 71 | Flags passed to samtools depth 72 | num_threads : int 73 | Number of threads 74 | user_project : str 75 | User project for GCP 76 | 77 | Returns 78 | ------- 79 | pileups_df : pd.DataFrame 80 | DataFrame of pileups (samples in columns) 81 | """ 82 | pileups_df = [] 83 | with mp.Pool(processes=num_threads) as pool: 84 | for k,r in enumerate(pool.imap(_samtools_depth_wrapper, [(i,region_str,j,bam_index_dir,flags,user_project) for j,i in bam_s.items()]), 1): 85 | if verbose: 86 | print(f'\r * running samtools depth on region {region_str} for bam {k}/{len(bam_s)}', end='' if k < len(bam_s) else None) 87 | pileups_df.append(r) 88 | pileups_df = pd.concat(pileups_df, axis=1) 89 | pileups_df.index.name = 'position' 90 | return pileups_df 91 | 92 | 93 | def read_regtools_junctions(junctions_file, convert_positions=True): 94 | """ 95 | Read output from regtools junctions extract and 96 | convert start/end positions to intron starts/ends. 97 | """ 98 | junctions_df = pd.read_csv(junctions_file, sep='\t', header=None, 99 | usecols=[0, 1, 2, 4, 5, 10], 100 | names=['chrom', 'start', 'end', 'count', 'strand', 'block_sizes']) 101 | if convert_positions: 102 | junctions_df['start'] += junctions_df['block_sizes'].apply(lambda x: int(x.split(',')[0])) + 1 103 | junctions_df['end'] -= junctions_df['block_sizes'].apply(lambda x: int(x.split(',')[1])) 104 | junctions_df.index = (junctions_df['chrom'] + ':' + junctions_df['start'].astype(str) 105 | + '-' + junctions_df['end'].astype(str) + ':' + junctions_df['strand']) 106 | return junctions_df 107 | 108 | 109 | def regtools_wrapper(args): 110 | """ 111 | Wrapper for regtools junctions extract. 
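    Expects `args` as a tuple of
    (bam_file, region_str, sample_id, bam_index_dir, strand, user_project),
    matching the unpacking in the function body.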
112 | Filters out secondary and supplementary alignments 113 | """ 114 | bam_file, region_str, sample_id, bam_index_dir, strand, user_project = args 115 | with tempfile.TemporaryDirectory() as tempdir: 116 | filtered_bam = os.path.join(tempdir, 'filtered.bam') 117 | cmd = f"samtools view -b -F 2304 {bam_file}" 118 | if user_project is not None: 119 | cmd += f"?userProject={user_project}" 120 | cmd += f" {region_str} > {filtered_bam}" 121 | with cd(bam_index_dir): 122 | subprocess.check_call(cmd, shell=True) 123 | subprocess.check_call(f"samtools index {filtered_bam}", shell=True) 124 | junctions_file = os.path.join(tempdir, 'junctions.txt.gz') 125 | cmd = f"regtools junctions extract \ 126 | -a 8 -m 50 -M 500000 -s {strand} \ 127 | {filtered_bam} | gzip -c > {junctions_file}" 128 | subprocess.check_call(cmd, shell=True, stderr=subprocess.DEVNULL) 129 | junctions_df = read_regtools_junctions(junctions_file, convert_positions=True) 130 | junctions_df.index.name = sample_id 131 | return junctions_df 132 | 133 | 134 | def regtools_extract_junctions(region_str, bam_s, bam_index_dir=None, strand=0, num_threads=12, 135 | user_project=None, verbose=True): 136 | """ 137 | region_str: string in 'chr:start-end' format 138 | bam_s: pd.Series or dict mapping sample_id->bam_path 139 | bam_index_dir: directory containing local copies of the BAM/CRAM indexes 140 | """ 141 | core.check_dependency('regtools') 142 | 143 | junctions_df = [] 144 | n = len(bam_s) 145 | with mp.Pool(processes=num_threads) as pool: 146 | for k,df in enumerate(pool.imap(regtools_wrapper, [(i,region_str,j,bam_index_dir,strand,user_project) for j,i in bam_s.items()]), 1): 147 | if verbose: 148 | print(f'\r * running regtools junctions extract on region {region_str} for bam {k}/{n}', end='' if k < n else None) 149 | junctions_df.append(df['count'].rename(df.index.name)) 150 | junctions_df = pd.concat(junctions_df, axis=1).infer_objects().fillna(0).astype(np.int32) 151 | junctions_df.index.name = 'junction_id' 152 | return junctions_df 153 | 154 | 155 | def norm_pileups(pileups_df, libsize_s, covariates_df=None, id_map=lambda x: '-'.join(x.split('-')[:2])): 156 | """ 157 | pileups_df: output from samtools_depth() 158 | libsize_s: pd.Series mapping sample_id->library size (total mapped reads) 159 | """ 160 | # convert pileups to reads per million 161 | pileups_rpm_df = pileups_df / libsize_s[pileups_df.columns] * 1e6 162 | pileups_rpm_df.rename(columns=id_map, inplace=True) 163 | 164 | if covariates_df is not None: 165 | residualizer = stats.Residualizer(covariates_df) 166 | pileups_rpm_df = residualizer.transform(pileups_rpm_df) 167 | 168 | return pileups_rpm_df 169 | 170 | 171 | def group_pileups(pileups_df, libsize_s, variant_id, genotypes, covariates_df=None, 172 | id_map=lambda x: '-'.join(x.split('-')[:2])): 173 | """ 174 | pileups_df: output from samtools_depth() 175 | libsize_s: pd.Series mapping sample_id->library size (total mapped reads) 176 | """ 177 | pileups_rpm_df = norm_pileups(pileups_df, libsize_s, covariates_df=covariates_df, id_map=id_map) 178 | 179 | # get genotype dosages 180 | if isinstance(genotypes, str) and genotypes.endswith('.vcf.gz'): 181 | g = gt.get_genotype(variant_id, genotypes)[pileups_rpm_df.columns] 182 | elif isinstance(genotypes, pd.Series): 183 | g = genotypes 184 | else: 185 | raise ValueError('Unsupported format for genotypes.') 186 | 187 | # average pileups by genotype or category 188 | cols = np.unique(g[g.notnull()]).astype(int) 189 | df = pd.concat([pileups_rpm_df[g[g == 
i].index].mean(axis=1).rename(i) for i in cols], axis=1) 190 | return df 191 | 192 | 193 | def plot(pileup_dfs, gene, mappability_bigwig=None, variant_id=None, order='additive', junctions_df=None, 194 | title=None, plot_variants=None, annot_track=None, max_intron=300, alpha=1, lw=0.5, junction_alpha=0.5, junction_lw=2, 195 | highlight_introns=None, highlight_introns2=None, shade_range=None, colors=None, junction_colors=None, 196 | ymax=None, xlim=None, rasterized=False, outline=False, labels=None, 197 | pc_color='k', nc_color='darkgray', show_cds=True, 198 | dl=0.75, aw=4.5, dr=0.75, db=0.5, ah=1.5, dt=0.25, ds=0.2): 199 | """ 200 | pileup_dfs: 201 | """ 202 | if junction_colors is None and colors is not None: 203 | junction_colors = colors 204 | 205 | if isinstance(pileup_dfs, pd.DataFrame): 206 | pileup_dfs = [pileup_dfs] 207 | num_pileups = len(pileup_dfs) 208 | 209 | nt = len(gene.transcripts) 210 | da = 0.08 * nt + 0.01*(nt-1) 211 | da2 = 0.12 212 | 213 | fw = dl + aw + dr 214 | fh = db + da + ds + (num_pileups-1)*da2 + num_pileups*ah + dt 215 | if mappability_bigwig is not None: 216 | fh += da2 217 | 218 | if variant_id is not None: 219 | chrom, pos, ref, alt = variant_id.split('_')[:4] 220 | pos = int(pos) 221 | if np.issubdtype(pileup_dfs[0].columns.dtype, np.integer): # assume that inputs are genotypes 222 | gtlabels = np.array([ 223 | f'{ref}:{ref}', 224 | f'{ref}:{alt}', 225 | f'{alt}:{alt}', 226 | ]) 227 | else: 228 | gtlabels = None 229 | else: 230 | pos = None 231 | gtlabels = None 232 | 233 | if pileup_dfs[0].shape[1] <= 3: 234 | cycler_colors = [ 235 | # hsv_to_rgb([0.55, 0.75, 0.8]), #(0.2, 0.65, 0.8), # blue 236 | # hsv_to_rgb([0.08, 1, 1]), #(1.0, 0.5, 0.0), # orange 237 | # hsv_to_rgb([0.3, 0.7, 0.7]), #(0.2, 0.6, 0.17), # green 238 | '#0374B3', # blue 239 | '#C84646', # red 240 | '#C69B3A', # gold 241 | ] 242 | else: 243 | cycler_colors = [rgb2hex(i) for i in plt.cm.tab10(np.arange(10))] 244 | custom_cycler = cycler('color', cycler_colors) 245 | 246 | fig = plt.figure(facecolor='none', figsize=(fw,fh)) 247 | ax = fig.add_axes([dl/fw, (db+da+ds)/fh, aw/fw, ah/fh], facecolor='none') 248 | ax.set_prop_cycle(custom_cycler) 249 | axv = [ax] 250 | for i in range(1, num_pileups): 251 | ax = fig.add_axes([dl/fw, (db+da+ds+i*(da2+ah))/fh, aw/fw, ah/fh], facecolor='none', sharex=axv[0]) 252 | ax.set_prop_cycle(custom_cycler) 253 | axv.append(ax) 254 | 255 | s = pileup_dfs[0].sum() 256 | if isinstance(order, list): 257 | sorder = order 258 | elif order == 'additive': 259 | sorder = s.index 260 | if s[sorder[0]] < s[sorder[-1]]: 261 | sorder = sorder[::-1] 262 | elif order == 'sorted': 263 | sorder = np.argsort(s)[::-1] 264 | elif order == 'none': 265 | sorder = s.index 266 | 267 | gene.set_plot_coords(max_intron=max_intron) 268 | for k,ax in enumerate(axv): 269 | xi = gene.map_pos(pileup_dfs[k].index) 270 | for j,i in enumerate(sorder): 271 | if i in pileup_dfs[k]: 272 | if outline: 273 | if colors is not None: 274 | c = colors[i] 275 | else: 276 | c = cycler_colors[j] 277 | ax.plot(xi, pileup_dfs[k][i], color=c, label=i, lw=lw, alpha=alpha, rasterized=rasterized) 278 | else: 279 | ax.fill_between(xi, pileup_dfs[k][i], label=i, alpha=alpha, rasterized=rasterized) 280 | 281 | if labels is None: 282 | labels = ['Mean RPM'] * num_pileups 283 | # format 284 | for k,ax in enumerate(axv): 285 | ax.margins(0) 286 | ax.set_ylabel(labels[k], fontsize=12) 287 | qtl_plot.format_plot(ax, fontsize=10) 288 | ax.tick_params(axis='x', length=3, width=0.6, pad=1) 289 | 
ax.set_xticks(gene.map_pos(gene.get_collapsed_coords().reshape(1,-1)[0])) 290 | ax.set_xticklabels([]) 291 | ax.spines['left'].set_position(('outward', 6)) 292 | 293 | if xlim is not None: 294 | ax.set_xlim(xlim) 295 | if ymax is not None: 296 | ax.set_ylim([0, ymax]) 297 | if gtlabels is not None: 298 | gtlabels = gtlabels[sorder] 299 | handles, _ = axv[-1].get_legend_handles_labels() 300 | # leg = axv[-1].legend(handles[::-1], gtlabels[::-1], loc='upper left', handlelength=0.75, handletextpad=0.5, bbox_to_anchor=(1.02,1), 301 | # labelspacing=0.2, borderaxespad=0, fontsize=10) 302 | leg = axv[-1].legend(handles[::-1], gtlabels[::-1], loc='lower left', handlelength=0.75, handletextpad=0.5, 303 | labelspacing=0.2, borderaxespad=None, fontsize=10)#, framealpha=1)#, facecolor=(1, 0, 1, 0)) 304 | 305 | for line in leg.get_lines(): 306 | line.set_linewidth(1.5) 307 | 308 | if plot_variants is not None and len(plot_variants) > 1: 309 | axv[-1].add_artist(leg)#, clip_on=False) 310 | 311 | if variant_id is not None and title is None: 312 | axv[-1].set_title(f"{gene.name} :: {variant_id.split('_b')[0].replace('_',':',1).replace('_','-')}", fontsize=11) 313 | else: 314 | axv[-1].set_title(title, fontsize=11) 315 | 316 | # plot variant(s) 317 | def _plot_variant(x, color='tab:red'): 318 | for ax in axv: 319 | xlim = np.diff(ax.get_xlim())[0] 320 | ylim = np.diff(ax.get_ylim())[0] 321 | h = 0.04 * ylim 322 | b = h/np.sqrt(3) * ah/aw * xlim/ylim 323 | v = np.array([[x-b, -h-0.01*ylim], [x+b, -h-0.01*ylim], [x, -0.01*ylim]]) 324 | ax.add_patch(patches.Polygon(v, closed=True, color=color, ec='k', lw=0.5, clip_on=False, zorder=10)) 325 | 326 | if isinstance(plot_variants, str): 327 | x = gene.map_pos(int(plot_variants.split('_')[1])) 328 | _plot_variant(x) 329 | elif isinstance(plot_variants, Iterable): 330 | for i in plot_variants: 331 | ipos = int(i.split('_')[1]) 332 | x = gene.map_pos(ipos) 333 | if pos is not None and ipos == pos: 334 | _plot_variant(x, color='tab:red') 335 | else: 336 | _plot_variant(x, color='tab:orange') 337 | elif plot_variants == True and pos is not None: 338 | x = gene.map_pos(pos) 339 | _plot_variant(x) 340 | 341 | if plot_variants is not None: 342 | kwargs = {'ec':'k', 'lw':0.5, 's':20, 'marker':'^'} 343 | h1 = ax.scatter(np.nan, np.nan, fc='tab:red', **kwargs, label='Lead') 344 | h2 = ax.scatter(np.nan, np.nan, fc='tab:orange', **kwargs, label='Other') 345 | if len(plot_variants) > 1: 346 | ax.legend(handles=[h1,h2], loc='lower left', title='CS variants', 347 | handlelength=1, handletextpad=0.5, borderaxespad=0, bbox_to_anchor=(1.02, 0)) 348 | 349 | ax.set_ylim([0, ax.get_ylim()[1]]) 350 | 351 | # plot highlight/shading 352 | if shade_range is not None: 353 | if isinstance(shade_range, str): 354 | shade_range = shade_range.split(':')[-1].split('-') 355 | shade_range = np.array(shade_range).astype(int) 356 | shade_range = gene.map_pos(shade_range) 357 | for k in range(len(shade_range)-1): 358 | axv[-1].add_patch(patches.Rectangle((shade_range[k], 0), shade_range[k+1]-shade_range[k], ax.get_ylim()[1], 359 | facecolor=[0.8]*3 if k % 2 == 0 else [0.9]*3, zorder=-10)) 360 | 361 | # add gene model 362 | gax = fig.add_axes([dl/fw, db/fh, aw/fw, da/fh], sharex=axv[0]) 363 | gene.plot(ax=gax, max_intron=max_intron, wx=0.2, highlight_introns=highlight_introns, 364 | highlight_introns2=highlight_introns2, ec='none', clip_on=True, 365 | pc_color=pc_color, nc_color=nc_color, show_cds=show_cds) 366 | gax.set_title('') 367 | if nt < 3: 368 | gax.set_ylabel('Isoforms', fontsize=10, 
rotation=0, ha='right', va='center') 369 | else: 370 | gax.set_ylabel('Isoforms', fontsize=10, labelpad=15) 371 | plt.setp(gax.get_xticklabels(), visible=False) 372 | plt.setp(gax.get_yticklabels(), visible=False) 373 | for s in ['top', 'right', 'bottom', 'left']: 374 | gax.spines[s].set_visible(False) 375 | gax.tick_params(length=0, labelbottom=False) 376 | axv.append(gax) 377 | 378 | if mappability_bigwig is not None: # add mappability 379 | xi = gene.map_pos(pileup_dfs[0].index) 380 | # c = gene.get_coverage(mappability_bigwig) 381 | with pyBigWig.open(mappability_bigwig) as bw: 382 | c = bw.values(gene.chr, int(pileup_dfs[0].index[0]-1), int(pileup_dfs[0].index[-1]), numpy=True) 383 | mpax = fig.add_axes([dl/fw, 0.25/fh, aw/fw, da2/fh], sharex=axv[0]) 384 | mpax.fill_between(xi, c, color=3*[0.6], lw=1, interpolate=False, rasterized=rasterized) 385 | for i in ['top', 'right']: 386 | mpax.spines[i].set_visible(False) 387 | mpax.spines[i].set_linewidth(0.6) 388 | mpax.set_ylabel('Map.', fontsize=10, rotation=0, ha='right', va='center') 389 | mpax.tick_params(axis='x', length=0, labelbottom=False) 390 | mpax.tick_params(axis='y', labelsize=8) 391 | mpax.spines['left'].set_position(('outward', 6)) 392 | axv.append(mpax) 393 | plt.sca(axv[0]) 394 | 395 | if annot_track is not None: 396 | tax = fig.add_axes([dl/fw, 0/fh, aw/fw, da2/fh], sharex=axv[0]) 397 | gene.plot_coverage(coverage=annot_track, ax=tax, max_intron=max_intron) 398 | tax.tick_params(length=0, labelbottom=False) 399 | # axv[-1].set_xlabel(f'Exon coordinates on {gene.chr}', fontsize=12) 400 | 401 | # need to plot last since this is plotted in a separate set of axes 402 | if junctions_df is not None: 403 | junctions_df = junctions_df.copy() 404 | junctions_df['start'] = junctions_df.index.map(lambda x: int(x.split(':')[-1].split('-')[0])) 405 | junctions_df['end'] = junctions_df.index.map(lambda x: int(x.split(':')[-1].split('-')[1])) 406 | for k,i in enumerate(sorder): 407 | s = pileup_dfs[0][i].copy() 408 | if junction_colors is not None: 409 | ec = junction_colors[i] 410 | else: 411 | ec = cycler_colors[k] 412 | gene.plot_junctions(ax, junctions_df, s, show_counts=False, align='minimum', count_col=i, 413 | h=0.3, lw=junction_lw, lw_fct=np.sqrt, ec=ec, alpha=junction_alpha, clip_on=True) 414 | 415 | return axv 416 | -------------------------------------------------------------------------------- /qtl/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy.stats 4 | import re 5 | import matplotlib.pyplot as plt 6 | import matplotlib.patches as patches 7 | import matplotlib.ticker as ticker 8 | import matplotlib.colors as colors 9 | from matplotlib.colors import hsv_to_rgb, ListedColormap, LogNorm 10 | import seaborn as sns 11 | import scipy.cluster.hierarchy as hierarchy 12 | from cycler import cycler 13 | from collections.abc import Iterable 14 | import copy 15 | import itertools 16 | 17 | from . import stats 18 | from . 
import map as qtl_map 19 | 20 | 21 | def setup_figure(aw=4.5, ah=3, xspace=[0.75,0.25], yspace=[0.75,0.25], 22 | colorbar=None, ds=0.15, cw=0.12, ct=0, ch=None, 23 | margin_axes=None, mx=0.5, dx=0.15, my=0.5, dy=0.15, polar=False): 24 | """ 25 | """ 26 | dl, dr = xspace 27 | db, dt = yspace 28 | fw = dl + aw + dr 29 | fh = db + ah + dt 30 | if margin_axes in ['x', 'both']: 31 | fw += dx + mx 32 | if margin_axes in ['y', 'both']: 33 | fh += dy + my 34 | fig = plt.figure(facecolor='none', figsize=(fw,fh)) 35 | axes = [fig.add_axes([dl/fw, db/fh, aw/fw, ah/fh], facecolor='none', zorder=1, polar=polar)] 36 | if margin_axes in ['y', 'both']: 37 | axes.append(fig.add_axes([dl/fw, (db+ah+dy)/fh, aw/fw, my/fh], sharex=axes[0], facecolor='none', zorder=0)) 38 | if margin_axes in ['x', 'both']: 39 | axes.append(fig.add_axes([(dl+aw+dx)/fw, db/fh, mx/fw, ah/fh], sharey=axes[0], facecolor='none', zorder=0)) 40 | if colorbar != 'horizontal': 41 | dl += dx + mx 42 | if colorbar == 'horizontal': 43 | if ch is None: 44 | ch = 0.4 * aw 45 | axes.append(fig.add_axes([(dl+aw-ch)/fw, (db+ah+ds)/fh, ch/fw, cw/fh], facecolor='none')) 46 | elif colorbar is not None and colorbar != False: # vertical 47 | if ch is None: 48 | ch = 0.4 * ah 49 | axes.append(fig.add_axes([(dl+aw+ds)/fw, (db+ah-ch-ct)/fh, cw/fw, ch/fh], facecolor='none')) 50 | 51 | if len(axes) == 1: 52 | axes = axes[0] 53 | return axes 54 | 55 | 56 | # if not box: 57 | # ax.spines['left'].set_position(('outward', 6)) 58 | # ax.spines['bottom'].set_position(('outward', 6)) 59 | # ax.spines['right'].set_visible(False) 60 | # ax.spines['top'].set_visible(False) 61 | # ax.tick_params(axis='both', which='both', direction='out', labelsize=fontsize) 62 | 63 | def get_axgrid(nr, nc, ntot=None, sharex=False, sharey=False, 64 | x_offset=6, y_offset=6, margins=None, polar=False, 65 | background_axes=False, 66 | dl=0.5, aw=2, dx=0.75, dr=0.25, 67 | db=0.5, ah=2, dy=0.75, dt=0.25, 68 | colorbar=None, ds=0.15, cw=0.15, ct=0, ch=None, 69 | tri=None, fontsize=10, hide=['top', 'right']): 70 | """ 71 | """ 72 | if ntot is None: 73 | ntot = nr * nc 74 | 75 | if not isinstance(aw, Iterable): 76 | aw = nc * [aw] 77 | 78 | if not isinstance(polar, Iterable): 79 | polar = ntot * [polar] 80 | 81 | fw = dl + sum(aw) + (nc-1)*dx + dr 82 | fh = db + nr*ah + (nr-1)*dy + dt 83 | fig = plt.figure(figsize=(fw,fh), facecolor='none') 84 | axes = [] 85 | n = 0 86 | 87 | if tri is None: 88 | si = lambda x: 0 89 | elif tri == 'upper': 90 | si = lambda x: x 91 | 92 | for j in range(nr): 93 | for i in range(si(j), nc): 94 | if n < ntot: 95 | ax = fig.add_axes([(dl+sum(aw[:i])+i*dx)/fw, (db+(nr-j-1)*(ah+dy))/fh, aw[i]/fw, ah/fh], facecolor='none', zorder=0, polar=polar[n], 96 | sharex=axes[0] if sharex and n>0 else None, 97 | sharey=axes[0] if sharey and n>0 else None) 98 | if not polar[n]: 99 | format_plot(ax, fontsize=fontsize, hide=hide, x_offset=x_offset, y_offset=y_offset) 100 | ax.margins(margins) 101 | axes.append(ax) 102 | n += 1 103 | 104 | if ch is None: 105 | ch = ah/2 106 | 107 | # add axes in background for plotting overlays 108 | if background_axes: 109 | bax = fig.add_axes([dl/fw, db/fh, (sum(aw) + (nc-1)*dx)/fw, (nr*ah + (nr-1)*dy)/fh], 110 | facecolor='none', zorder=-1, label='background', 111 | sharex=axes[0] if sharex and nc == 1 else None, 112 | sharey=axes[0] if sharey and nr == 1 else None) 113 | format_plot(bax, hide=['top', 'right', 'bottom', 'left']) 114 | hide_ticks(bax) 115 | bax.margins(0) 116 | else: 117 | bax = None 118 | 119 | # add colorbars 120 | if 
isinstance(colorbar, Iterable): 121 | cax = [] 122 | for k in colorbar: 123 | i = k // nc # row 124 | j = k - i*nc # col 125 | cax.append(fig.add_axes([(dl+sum(aw[:j+1])+j*dx+ds)/fw, (db+(nr-i)*ah+(nr-i-1)*dy-ch-ct)/fh, cw/fw, ch/fh])) 126 | elif colorbar == True: 127 | cax = fig.add_axes([(dl+sum(aw)+(nc-1)*dx+ds)/fw, (db+nr*ah+(nr-1)*dy-ch-ct)/fh, cw/fw, ch/fh]) 128 | else: 129 | cax = None 130 | r = [axes] 131 | if cax is not None: 132 | r.append(cax) 133 | if bax is not None: 134 | r.append(bax) 135 | if len(r) == 1: 136 | r = r[0] 137 | else: 138 | r = tuple(r) 139 | return r 140 | 141 | 142 | def hide_ticks(ax, axis='both'): 143 | if axis in ['x', 'both']: 144 | plt.setp(ax.get_xticklabels(), visible=False) 145 | for line in ax.xaxis.get_ticklines(): 146 | line.set_markersize(0) 147 | line.set_markeredgewidth(0) 148 | if axis in ['y', 'both']: 149 | plt.setp(ax.get_yticklabels(), visible=False) 150 | for line in ax.yaxis.get_ticklines(): 151 | line.set_markersize(0) 152 | line.set_markeredgewidth(0) 153 | 154 | 155 | def format_plot(ax, tick_direction='out', tick_length=4, hide=['top', 'right'], 156 | hide_spines=True, lw=1, fontsize=10, 157 | equal_limits=False, x_offset=0, y_offset=0, vmin=None): 158 | 159 | # ax.autoscale(False) 160 | for i in ['left', 'bottom', 'right', 'top']: 161 | ax.spines[i].set_linewidth(lw) 162 | 163 | ax.tick_params(axis='both', which='both', direction=tick_direction, labelsize=fontsize) 164 | 165 | # set tick positions 166 | if 'top' in hide and 'bottom' in hide: 167 | ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False) 168 | elif 'top' in hide: 169 | ax.get_xaxis().set_ticks_position('bottom') 170 | elif 'bottom' in hide: 171 | ax.get_xaxis().set_ticks_position('top') 172 | else: 173 | ax.get_xaxis().set_ticks_position('both') 174 | 175 | if 'left' in hide and 'right' in hide: 176 | ax.tick_params(axis='y', which='both', left=False, right=False, labelleft=False) 177 | elif 'left' in hide: 178 | ax.get_yaxis().set_ticks_position('right') 179 | elif 'right' in hide: 180 | ax.get_yaxis().set_ticks_position('left') 181 | elif len(hide) == 0: 182 | ax.get_xaxis().set_ticks_position('bottom') 183 | ax.get_yaxis().set_ticks_position('left') 184 | else: 185 | ax.get_yaxis().set_ticks_position('both') 186 | 187 | if hide_spines: 188 | for i in hide: 189 | ax.spines[i].set_visible(False) 190 | 191 | # adjust tick size 192 | for line in ax.xaxis.get_ticklines() + ax.yaxis.get_ticklines(): 193 | line.set_markersize(tick_length) 194 | line.set_markeredgewidth(lw) 195 | 196 | for line in (ax.xaxis.get_ticklines(minor=True) + ax.yaxis.get_ticklines(minor=True)): 197 | line.set_markersize(tick_length/2) 198 | line.set_markeredgewidth(lw/2) 199 | 200 | ax.spines['left'].set_position(('outward', y_offset)) 201 | ax.spines['bottom'].set_position(('outward', x_offset)) 202 | 203 | if equal_limits: 204 | xlim = ax.get_xlim() 205 | ylim = ax.get_ylim() 206 | lims = [np.minimum(xlim[0], ylim[0]), np.maximum(xlim[1], ylim[1])] 207 | if vmin is not None: 208 | lims[0] = vmin 209 | ax.set_xlim(lims) 210 | ax.set_ylim(lims) 211 | 212 | # ax.autoscale(True) # temporary fix? 
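# Example usage of the figure helpers above (a minimal sketch; sizes are in inches,
# and with `colorbar=True` get_axgrid also returns a colorbar axes):
#
#   ax = setup_figure(aw=3, ah=2)
#   ax.plot(range(10), range(10))
#   format_plot(ax, fontsize=9, x_offset=6, y_offset=6)
#
#   axes, cax = get_axgrid(2, 3, sharex=True, colorbar=True)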
213 | 214 | 215 | def plot_qtl(g, p, label_s=None, label_colors=None, split=False, split_colors=None, covariates_df=None, 216 | legend_text=None, show_pval=False, show_slope=False, normalized=False, loc=None, ax=None, color=[0.5]*3, 217 | variant_id=None, jitter=0, bvec=None, boxplot=False, xlabel=None, clip_on=True, 218 | aw=2, ah=2, dr=0.25, dt=0.25, db=0.5, dl=0.75, width=0.75, lw=1, 219 | ylabel='Normalized expression', title=None, show_counts=True): 220 | """""" 221 | 222 | assert p.index.equals(g.index) 223 | 224 | if show_pval or show_slope: 225 | stats_s = qtl_map.calculate_association(g, p, covariates_df=covariates_df).iloc[0] 226 | 227 | if covariates_df is not None: 228 | # only residualize the phenotype for plotting 229 | p = stats.residualize(p.copy(), covariates_df.loc[p.index]) 230 | 231 | qtl_df = pd.concat([g.round().astype(int), p], axis=1) 232 | qtl_df.columns = ['genotype', 'phenotype'] 233 | if label_s is not None: 234 | qtl_df = pd.concat([qtl_df, label_s], axis=1, sort=False) 235 | 236 | if ax is None: 237 | ax = setup_figure(aw, ah, xspace=[dl, dr], yspace=[db, dt]) 238 | 239 | if not normalized: 240 | if split and label_s is not None: 241 | if split_colors is None: 242 | split_colors = [ 243 | hsv_to_rgb([0.025, 1, 0.8]), 244 | hsv_to_rgb([0.575, 1, 0.8]) 245 | ] 246 | pal = sns.color_palette(split_colors) 247 | i = qtl_df.columns[2] 248 | sns.violinplot(x="genotype", y="phenotype", hue=i, hue_order=sorted(qtl_df[i].unique()), 249 | data=qtl_df, palette=pal, ax=ax, order=[0,1,2], density_norm='width', cut=0, linewidth=lw, width=width) 250 | l = ax.legend(loc='lower center', bbox_to_anchor=(0.5, 1), fontsize=8, handlelength=0.6, ncol=2, handletextpad=0.5, labelspacing=0.33) 251 | l.set_title(None) 252 | else: 253 | sns.violinplot(x="genotype", y="phenotype", data=qtl_df, width=width, 254 | cut=0, ax=ax, order=[0,1,2], color=color, linewidth=lw, clip_on=clip_on) 255 | else: 256 | pass 257 | # if labels is not None: 258 | # ax.scatter(g, p, c=labels, cmap=colors.LinearSegmentedColormap.from_list('m', label_colors), alpha=0.8, s=25, edgecolors='none') 259 | # else: 260 | # # ax.scatter(g, p, c=hsv_to_rgb([0.55,0.8,0.8]), alpha=0.8, s=25, edgecolors='none') 261 | # ax.scatter(g, p, c='k', alpha=0.66, s=25, edgecolors='none') 262 | 263 | ax.set_xlabel(xlabel, fontsize=12, labelpad=8) 264 | ax.set_ylabel(ylabel, fontsize=12) 265 | format_plot(ax, lw=1, fontsize=9, x_offset=6, y_offset=6) 266 | ax.set_xlim([-0.5,2.5]) 267 | ax.spines['bottom'].set_bounds([0, 2]) 268 | ax.yaxis.set_major_locator(ticker.MaxNLocator(min_n_ticks=5, nbins=5)) 269 | 270 | if show_slope: 271 | x = np.array([-0.5, 2.5]) 272 | r = scipy.stats.linregress(g, p) 273 | ax.plot(x, r.intercept + x*r.slope, linestyle=(0, (3, 2)), lw=2, color='tab:blue') 274 | 275 | if title is not None: 276 | ax.set_title(title, fontsize=12) 277 | 278 | if variant_id is not None: 279 | ref, alt = variant_id.split('_')[2:4] 280 | labels = [ 281 | f'{ref}/{ref}', 282 | f'{ref}/{alt}', 283 | f'{alt}/{alt}', 284 | ] 285 | else: 286 | labels = [0, 1, 2] 287 | 288 | if show_counts: 289 | if not split: 290 | gcounts = g.astype(int).value_counts() 291 | labels = [f"{v}\n{gcounts.get(k, 0)}" for k,v in enumerate(labels)] 292 | else: 293 | var_s = qtl_df[qtl_df.columns[2]] 294 | c = np.unique(var_s) 295 | assert len(c) == 2 296 | 297 | gcounts1 = g[var_s == c[0]].value_counts().reindex(np.arange(3), fill_value=0) 298 | gcounts2 = g[var_s == c[1]].value_counts().reindex(np.arange(3), fill_value=0) 299 | labels = 
[f"{v}\n({gcounts1[k]},{gcounts2[k]})" for k,v in enumerate(labels)] 300 | ax.set_xticks(range(len(labels)), labels) 301 | 302 | if show_pval: 303 | if stats_s['slope'] > 0: 304 | ax.text(0.05, 1, f"P = {stats_s['pval_nominal']:.2g}", va='top', ha='left', transform=ax.transAxes, fontsize=11) 305 | else: 306 | ax.text(0.95, 1, f"P = {stats_s['pval_nominal']:.2g}", va='top', ha='right', transform=ax.transAxes, fontsize=11) 307 | 308 | return ax 309 | 310 | 311 | def plot_interaction(p, g, i, variant_id=None, annot=None, covariates_df=None, lowess=None, 312 | xlabel=None, ylabel=None, title=None, alpha=0.8, s=20, fontsize=14, 313 | ah=3, aw=3): 314 | """ 315 | Plot interaction QTL 316 | 317 | Model: 318 | p = b0 + b1*g + b2*i + b3*gi 319 | 320 | Args: 321 | lowess: fraction of data to use [0,1] 322 | """ 323 | 324 | assert p.index.equals(g.index) and p.index.equals(i.index) 325 | 326 | if covariates_df is not None: 327 | assert p.index.equals(covariates_df.index) 328 | X = np.c_[len(g)*[1],g,i,g*i,covariates_df] 329 | else: 330 | X = np.c_[len(g)*[1],g,i,g*i] 331 | b,_,_,_ = np.linalg.lstsq(X, p, rcond=None) 332 | 333 | if variant_id is not None: 334 | ref, alt = variant_id.split('_')[2:4] 335 | else: 336 | ref, alt = 'ref', 'alt' 337 | labels = { 338 | 0: f'{ref}/{ref}', 339 | 1: f'{ref}/{alt}', 340 | 2: f'{alt}/{alt}', 341 | } 342 | 343 | ax = setup_figure(ah, aw) 344 | ax.margins(0.02) 345 | 346 | custom_cycler = cycler('color', [ 347 | # hsv_to_rgb([0.55,1,0.8]), 348 | # sns.color_palette("Paired")[7], # orange 349 | # hsv_to_rgb([0,1,0.8]), 350 | sns.color_palette("husl", 8)[5], # blue 351 | sns.color_palette("Paired")[7], # orange 352 | sns.color_palette("Paired")[3], # green 353 | ]) 354 | ax.set_prop_cycle(custom_cycler) 355 | 356 | gorder = [0,1,2] 357 | # gorder = [2,1,0] 358 | # mu = [p[g==g0].mean() for g0 in np.unique(g)] 359 | # if mu[0]= ld_threshold] 426 | X = v[['row', 'col']].copy().values.T 427 | X[1,:] -= start_pos 428 | x0 = np.array([[start_pos, 0]]).T 429 | R = np.array([[1, 1], [-1, 1]])/np.sqrt(2) 430 | 431 | # set up figure 432 | if ax is None: 433 | pad = 0.1 434 | dl = pad 435 | aw = 8 436 | dr = 0.5 437 | db = 0.5 438 | ah = aw/yscale # must also scale ylim below 439 | dt = pad 440 | fw = dl+aw+dr 441 | fh = db+ah+dt 442 | ds = 0.1 443 | fig = plt.figure(facecolor=(1,1,1), figsize=(fw,fh)) 444 | ax = fig.add_axes([dl/fw, db/fh, aw/fw, ah/fh]) 445 | cax = fig.add_axes([(dl+aw+ds)/fw, db/fh, 0.1/fw, 0.8/fh]) 446 | 447 | # plot 448 | X = np.dot(R, X-x0)/np.sqrt(2) + x0 449 | order = np.argsort(v[0]) 450 | h = ax.scatter(X[0,order]/xunit, X[1,order]/xunit, s=s, c=v[0].iloc[order], marker='D', clip_on=clip_on, 451 | alpha=alpha, edgecolor='none', cmap=cmap, vmin=0, vmax=1, rasterized=rasterized) 452 | 453 | if cax is not None: 454 | hc = plt.colorbar(h, cax=cax) 455 | hc.set_label('$\mathregular{R^2}$', fontsize=12, rotation=0, ha='left', va='center') 456 | hc.locator = ticker.MaxNLocator(min_n_ticks=3, nbins=2) 457 | xlim = np.array([start_pos, end_pos]) / xunit 458 | ax.set_xlim(xlim) 459 | ax.set_ylim([-np.diff(xlim)[0]/yscale, 0]) 460 | 461 | for s in ['left', 'top', 'right']: 462 | ax.spines[s].set_visible(False) 463 | ax.set_yticks([]) 464 | 465 | ax.set_xlabel(f"Position on {variant_df['chr'].iloc[0]} (Mb)", fontsize=14) 466 | 467 | if ld_bounds is not None: 468 | ci = (ld_bounds[:-1] + ld_bounds[1:]) / 2 # center position for each block 469 | y = -np.diff(ld_bounds) / 2 / xunit 470 | yi = np.array([i for i in 
itertools.chain(*itertools.zip_longest(np.zeros(len(ld_bounds)), y)) if i is not None]) 471 | xi = np.array([i for i in itertools.chain(*itertools.zip_longest(ld_bounds, ci)) if i is not None]) 472 | ax.plot(xi/xunit, yi, linestyle=(0, (4, 3)), lw=1, color='tab:red') 473 | 474 | return ax 475 | 476 | 477 | def plot_locus_summary(region_str, tracks_dict=None, ld_df=None, ld_bounds=None, coverage_cat=None, 478 | track_colors=None, labels=None, order=None, 479 | pip_df=None, pip_order=None, pip_colors=None, pip_legend=False, 480 | gene=None, ld_marker_size=1, aw=6, ah=4, dl=2, dr=1, ph=0.1, gh=0.15): 481 | """ 482 | Visualization of genetic locus, combining coverage tracks (e.g., ATAC-seq), 483 | variants (e.g., fine-mapped QTLs), genes/transcripts, and LD. 484 | 485 | Inputs: 486 | pip_df: variants with PIPs from fine-mapping 487 | tracks_dict: tracks 488 | ld_df: LD matrix 489 | """ 490 | 491 | if isinstance(tracks_dict, dict): 492 | ntracks = len(tracks_dict) 493 | ah = ntracks*0.2 494 | elif isinstance(tracks_dict, pd.DataFrame): 495 | ntracks = tracks_dict.shape[1] 496 | else: 497 | ntracks = 0 498 | ah = 0 499 | 500 | if gene is not None: 501 | if not isinstance(gene, Iterable): 502 | gene = [gene] 503 | gh = gh * len(gene) # gene model 504 | else: 505 | gh = 0 506 | 507 | db = 0.5 508 | ldh = 1 # LD plot 509 | ds0 = 0 510 | ds = 0.1 511 | if pip_df is None: 512 | ph = 0 513 | elif isinstance(pip_df, pd.DataFrame): 514 | pip_df = [pip_df] 515 | pah = [ph*df['trait_id'].nunique() for df in pip_df] 516 | 517 | dt = 0.25 518 | fw = dl + aw + dr 519 | fh = db + ldh + ds0 + gh + ds + sum(pah) + ds*(len(pah)-1) + ds + ah + dt 520 | fig = plt.figure(figsize=(fw,fh)) 521 | axes = [] 522 | if tracks_dict is not None: 523 | ax = fig.add_axes([dl/fw, (db+ldh+ds0+gh+ds+sum(pah)+len(pah)*ds)/fh, aw/fw, ah/fh], facecolor='none') 524 | axes.append(ax) 525 | if pip_df is not None: 526 | faxes = [] 527 | for k,h in enumerate(pah): 528 | fax = fig.add_axes([dl/fw, (db+ldh+ds0+gh+ds+(len(pah)-1-k)*ds+sum(pah[k+1:]))/fh, aw/fw, h/fh], 529 | facecolor='none', sharex=axes[0] if len(axes)>0 else None) 530 | faxes.append(fax) 531 | axes.append(fax) 532 | if gene is not None: 533 | gax = fig.add_axes([dl/fw, (db+ldh+ds0)/fh, aw/fw, gh/fh], facecolor='none') 534 | axes.append(gax) 535 | if ld_df is not None: 536 | lax = fig.add_axes([dl/fw, db/fh, aw/fw, ldh/fh], facecolor='none', sharex=axes[0] if len(axes)>0 else None) 537 | lcax = fig.add_axes([(dl+aw+ds)/fw, (db+ldh/2)/fh, 0.1/fw, ldh/2/fh], facecolor='none') 538 | axes.append(lax) 539 | 540 | chrom, start_pos, end_pos = re.split(':|-', region_str) 541 | start_pos = int(start_pos) 542 | end_pos = int(end_pos) 543 | x = np.arange(start_pos, end_pos+1) / 1e6 544 | 545 | if order is not None: 546 | labels = order 547 | elif tracks_dict is not None: 548 | labels = list(tracks_dict.keys()) 549 | 550 | if isinstance(tracks_dict, dict): 551 | #maxv = np.max([np.max(tracks_dict[k]) for k in labels]) 552 | for k, label in enumerate(labels): 553 | c = tracks_dict[label] 554 | y0 = (ntracks-1-k) * np.ones(len(x)) # vertical offset 555 | # if label in track_colors: 556 | # color = track_colors[label] 557 | # else: 558 | # color = 'k' 559 | # ax.fill_between(x, 0.8*c/np.nanmax(c)+y0, y0, 560 | # ax.fill_between(x, 0.8*c/15+y0, y0, 561 | maxv = np.max(c) 562 | ax.fill_between(x, 0.9*c/maxv + y0, y0, antialiased=False, linewidth=1, 563 | facecolor=track_colors.get(label, 'k') if track_colors is not None else 'k', 564 | clip_on=True, rasterized=True) 565 | 566 | 
ax.set_yticks(np.arange(ntracks)) 567 | ax.set_yticklabels([i.replace('_', ' ') for i in labels[::-1]], fontsize=8, va='bottom') 568 | ax.spines['left'].set_bounds((0, ntracks-1)) 569 | ax.spines['left'].set_position(('outward', 6)) 570 | format_plot(ax, fontsize=8, hide=['top', 'right', 'bottom'], y_offset=6) 571 | ax.set_ylim([-0.5, ntracks-0.5]) 572 | 573 | elif isinstance(tracks_dict, pd.DataFrame): # plot as heatmap 574 | ax.invert_yaxis() 575 | ax.imshow(np.log10(tracks_dict.values.T), 576 | #extent=(), 577 | aspect='auto', interpolation='none') 578 | ax2 = fig.add_axes([(dl-0.2)/fw, (db+ldh+ds0+gh+ds+ph+ds)/fh, 0.1/fw, ah/fh], sharey=ax) 579 | # format_plot(ax, fontsize=8, hide=['top', 'right', 'bottom', 'left']) 580 | format_plot(ax2, fontsize=8, hide=['top', 'right', 'bottom', 'left']) 581 | ax2.set_xticks([]) 582 | 583 | cohort_index_dict = {i:k for k,i in enumerate(np.unique(coverage_cat))} 584 | # if cohort_colors is None: 585 | n = len(cohort_index_dict) 586 | cmap = ListedColormap(plt.cm.get_cmap('Spectral', n)(np.arange(n)), 'indexed') 587 | # else: 588 | # cmap = ListedColormap(np.stack(pd.Series(cohort_index_dict).sort_values().index.map(cohort_colors))) 589 | 590 | # if orientation == 'vertical': 591 | ax2.imshow(coverage_cat.map(cohort_index_dict).values.reshape(-1,1), aspect='auto', cmap=cmap) 592 | # else: 593 | # ax.imshow(cohort_s.map(cohort_index_dict).values.reshape(1,-1), aspect='auto', cmap=cmap) 594 | # 595 | # if lax is None: 596 | # lax = ax 597 | for k,i in cohort_index_dict.items(): 598 | ax.scatter(np.nan, np.nan, marker='s', c=[cmap(i)], label=f'{k}') 599 | # if legend: 600 | ax.legend(loc='upper right', borderaxespad=None, bbox_to_anchor=(-0.05,1), handlelength=1, title='Taxonomy', fontsize=8) 601 | ax.set_ylim([ntracks-0.5, -0.5]) 602 | ax.set_yticks([]) 603 | 604 | if tracks_dict is not None: 605 | plt.setp(ax.get_xticklabels(), visible=False) 606 | if len(axes) > 0: 607 | axes[0].set_xlim([x[0], x[-1]]) 608 | 609 | if pip_df is not None: 610 | for k, df in enumerate(pip_df): 611 | fax = faxes[k] 612 | pip_order = df['trait_category'].unique() # TODO: add back as option? 
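# One row of markers per trait: the loop below stacks traits vertically,
# grouped by 'trait_category', with marker area scaled by the fine-mapping
# posterior inclusion probability (PIP). The scatter calls on (NaN, NaN)
# only create legend handles, one per category.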
613 | i = 0 614 | traits = [] 615 | for category_id in pip_order: 616 | cdf = df[df['trait_category'] == category_id] 617 | for i,(trait_id,gdf) in enumerate(cdf.groupby('trait_id', observed=True), i+1): 618 | traits.append(trait_id) 619 | fax.scatter(gdf['position']/1e6, [i-1]*gdf.shape[0], s=30*gdf['pip'], 620 | color=pip_colors.get(category_id, 'k') if pip_colors is not None else 'k', edgecolor='none', clip_on=False) 621 | if cdf.shape[0] > 0: 622 | fax.scatter(np.nan, np.nan, s=20, color=pip_colors.get(category_id, 'k') if pip_colors is not None else 'k', label=category_id, edgecolor='none') 623 | fax.invert_yaxis() 624 | fax.set_yticks(np.arange(len(traits))) 625 | 626 | fax.spines['bottom'].set_visible(False) 627 | fax.spines['top'].set_visible(False) 628 | fax.spines['left'].set_position(('outward', 6)) 629 | fax.spines['left'].set_bounds((0, i-1)) 630 | fax.spines['right'].set_visible(False) 631 | fax.set_xlim([x[0], x[-1]]) 632 | fax.set_yticklabels(traits, fontsize=7) 633 | plt.setp(fax.get_xticklabels(), visible=False) 634 | for line in fax.xaxis.get_ticklines(): 635 | line.set_markersize(0) 636 | line.set_markeredgewidth(0) 637 | 638 | if pip_legend == True: 639 | fax.legend(loc='upper left', borderaxespad=0, borderpad=0.25, bbox_to_anchor=(1.01,1), fontsize=8, handlelength=0.75, handletextpad=0.5, labelspacing=0) 640 | 641 | 642 | if gene is not None: 643 | for k,g in enumerate(gene[::-1]): 644 | g.plot(ax=gax, max_intron=1e9, pc_color='k', nc_color='k', ec='none', yoffset=k, scale=0.33, clip_on=True) 645 | gax.annotate(g.name, (g.end_pos, k), 646 | xytext=(5,0), textcoords='offset points', 647 | va='center', ha='left', fontsize=10) 648 | xlim = np.array([x[0], x[-1]])*1e6 649 | gax.set_xlim(xlim) 650 | gax.set_xticks([]) 651 | gax.set_yticks([]) 652 | gax.spines['bottom'].set_visible(False) 653 | gax.spines['top'].set_visible(False) 654 | gax.spines['left'].set_visible(False) 655 | gax.spines['right'].set_visible(False) 656 | 657 | if ld_df is not None: 658 | format_plot(lax, fontsize=10) 659 | plot_ld(ld_df, ld_bounds=ld_bounds, start_pos=start_pos, end_pos=end_pos, cmap=plt.cm.Greys, 660 | s=ld_marker_size, clip_on=True, yscale=aw/ldh, ax=lax, cax=lcax) 661 | 662 | if len(axes) > 0: 663 | axes[-1].set_xlabel(f"Position on {chrom} (Mb)", fontsize=12) 664 | 665 | return axes 666 | 667 | 668 | def plot_effects(dfs, args, ax=None, 669 | xspace=[2.25,2,0.5], yspace=[0.5,3,0.5], xlim=None, 670 | xlabel='log$\mathregular{_{2}}$(Fold enrichment)', ylabel=None): 671 | """""" 672 | 673 | if isinstance(dfs, pd.DataFrame): 674 | dfs = [dfs] 675 | args = [args] 676 | ix = dfs[0].index.tolist() 677 | for df in dfs[1:]: 678 | assert np.all(df.index == ix) 679 | 680 | if ax is None: 681 | dl, aw, dr = xspace 682 | db, ah, dt = yspace 683 | 684 | fw = dl + aw + dr 685 | fh = db + ah + dt 686 | fig = plt.figure(facecolor=(1,1,1), figsize=(fw,fh)) 687 | ax = fig.add_axes([dl/fw, db/fh, aw/fw, ah/fh]) 688 | 689 | if xlim is not None: 690 | ax.set_xlim(xlim) 691 | y = np.arange(len(ix)) 692 | ax.set_ylim([y[0]-0.5, y[-1]+0.5]) 693 | 694 | ax.plot([0,0], [-0.5,len(ix)-0.5], '--', color=[0.33]*3, lw=1, zorder=-8) 695 | 696 | n = len(dfs) 697 | d = 0 698 | if n == 2: 699 | # d = [-0.25, 0.25] 700 | # d = [-0.2, 0.2] 701 | d = [-0.15,0.15] 702 | elif n == 3: 703 | d = [-0.25, 0, 0.25] 704 | elif n == 4: 705 | d = [-0.25, -0.15, 0.15, 0.25] 706 | 707 | for k,df in enumerate(dfs): 708 | mean_col = df.columns[0] 709 | ci_cols = df.columns[1:] 710 | delta = (df[ci_cols].T - df[mean_col]).abs() 711 | 
ax.errorbar(df[mean_col], y+d[k], xerr=delta.values, **args[k]) 712 | 713 | if xlim is None: 714 | xlim = ax.get_xlim() 715 | for i in y: 716 | if i % 2 == 0: 717 | c = [0.95]*3 718 | c = [1]*3 719 | else: 720 | c = [0.75]*3 721 | c = [0.9]*3 722 | patch = patches.Rectangle((xlim[0], i-0.5), np.diff(xlim), 1, fc=c, zorder=-10) 723 | ax.add_patch(patch) 724 | 725 | ax.set_xlabel(xlabel, fontsize=12) 726 | if ylabel is not None: 727 | ax.set_ylabel(ylabel, fontsize=12) 728 | ax.set_yticks(y) 729 | ax.set_yticklabels(ix) 730 | 731 | ax.invert_yaxis() 732 | return ax 733 | 734 | 735 | def _qq_scatter(ax, pval, ntests=None, label=None, c=None, zorder=None, 736 | max_values=100000, step=1000, is_sorted=False, args=None): 737 | """""" 738 | if ntests is None: 739 | ntests = len(pval) 740 | n = len(pval) 741 | if n > max_values: 742 | xi = np.array(list(range(1, max_values+1)) + list(range(max_values+step, n+step, step))) 743 | else: 744 | xi = np.arange(1, n+1) 745 | x = -np.log10(xi/(ntests+1)) 746 | 747 | if not is_sorted: 748 | log_pval_sorted = -np.log10(np.sort(pval)) 749 | else: 750 | log_pval_sorted = -np.log10(pval) 751 | 752 | ax.scatter(x, list(log_pval_sorted[:max_values]) + list(log_pval_sorted[max_values::step]), 753 | c=c, zorder=zorder, label=label, **args) 754 | 755 | 756 | def qqplot(pval, pval_null=None, ntests=None, ntests_null=None, max_values=100000, step=1000, is_sorted=False, 757 | title='', labels=None, fontsize=12, ax=None, equal_axes=False): 758 | """QQ-plot 759 | 760 | ntests: total number of tests if not equal to len(pval), 761 | e.g., if only tail of p-value distribution is provided 762 | """ 763 | if labels is None: 764 | labels = ['', ''] 765 | if ntests is None: 766 | ntests = len(pval) 767 | 768 | if ax is None: 769 | ax = setup_figure(2,2) 770 | ax.margins(x=0.02, y=0.05) 771 | args = {'s':16, 'edgecolor':'none', 'clip_on':False, 'alpha':1, 'rasterized':True} 772 | 773 | # Q-Q plot for pval 774 | _qq_scatter(ax, pval, ntests=ntests, label=labels[0], c=None, zorder=30, 775 | max_values=max_values, step=step, is_sorted=is_sorted, args=args) 776 | 777 | # Q-Q plot for null 778 | if pval_null is not None: 779 | _qq_scatter(ax, pval_null, ntests=ntests_null, label=labels[1], c=[[0.5]*3], zorder=20, 780 | max_values=max_values, step=step, is_sorted=is_sorted, args=args) 781 | 782 | ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True, min_n_ticks=5, nbins=4)) 783 | ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True, min_n_ticks=5, nbins=4)) 784 | 785 | ax.set_xlabel('Expected -log$\mathregular{_{10}}$(p-value)', fontsize=fontsize) 786 | ax.set_ylabel('Observed -log$\mathregular{_{10}}$(p-value)', fontsize=fontsize) 787 | format_plot(ax, fontsize=fontsize-2) 788 | 789 | xlim = ax.get_xlim() 790 | ylim = ax.get_ylim() 791 | if equal_axes: 792 | m = np.maximum(xlim[1], ylim[1]) 793 | ax.set_xlim([0, m]) 794 | ax.set_ylim([0, m]) 795 | else: 796 | ax.set_xlim([0, xlim[1]]) 797 | ax.set_ylim([0, ylim[1]]) 798 | 799 | # plot confidence interval 800 | ci = 0.95 801 | xi = np.linspace(1, ntests, 100000) 802 | x = -np.log10(xi/(ntests+1)) 803 | clower = -np.log10(scipy.stats.beta.ppf((1-ci)/2, xi, xi[::-1])) 804 | cupper = -np.log10(scipy.stats.beta.ppf((1+ci)/2, xi, xi[::-1])) 805 | ax.fill_between(x, cupper, clower, color=[[0.8]*3], clip_on=True, rasterized=True) 806 | b = -np.log10([ntests/(ntests+1), 1/(ntests+1)]) 807 | ax.autoscale(False) 808 | ax.plot(b, b, '--', lw=1, color=[0.2]*3, zorder=50, clip_on=True) 809 | 810 | 
ax.spines['left'].set_position(('outward', 6)) 811 | ax.spines['bottom'].set_position(('outward', 6)) 812 | ax.set_title(f'{title}', fontsize=12) 813 | if labels[0] != '': 814 | ax.legend(loc='upper left', fontsize=10, handlelength=0.5, handletextpad=0.33) 815 | return ax 816 | 817 | 818 | class CohortLabel(object): 819 | def __init__(self, cohort_s, cmap=None, colors=None, label_pos='left', vmin=None, vmax=None, bad_color=None): 820 | assert cmap is not None or colors is not None 821 | assert not cohort_s.index.duplicated().any() 822 | if cohort_s.dtype == 'O': 823 | cohort_s = cohort_s.astype('category') 824 | self.cohort_s = cohort_s 825 | if cmap is not None and bad_color is not None: 826 | cmap = copy.copy(cmap) 827 | cmap.set_bad(bad_color, 1) 828 | self.cmap = cmap 829 | self.vmin = vmin 830 | self.vmax = vmax 831 | self.name = cohort_s.name 832 | self.label_pos = label_pos 833 | self.colors = colors 834 | 835 | if cohort_s.dtype.name == 'category': 836 | # get numerical index 837 | self.values_s = cohort_s.astype(str).map({j:i for i,j in enumerate(cohort_s.cat.categories)}) 838 | if colors is None: 839 | n = len(cohort_s.cat.categories) 840 | colors = cmap(np.linspace(0, 1, np.maximum(n, 5))) 841 | self.colors = {k:v for k,v in zip(cohort_s.cat.categories, colors)} 842 | self.cmap = ListedColormap(cohort_s.cat.categories.map(self.colors)) 843 | else: 844 | self.values_s = cohort_s 845 | 846 | def plot(self, ix=None, ax=None, show_frame=False): 847 | if ax is None: 848 | ax, cax = setup_figure(2, 0.5, colorbar=True, ch=0.5) 849 | # ax, cax = setup_figure(0.5, 2, colorbar=True, ch=0.5) 850 | 851 | if ix is None: 852 | x = self.values_s.values 853 | else: 854 | x = self.values_s[ix].values 855 | 856 | # detect orientation 857 | bbox = ax.get_window_extent().transformed(ax.get_figure().dpi_scale_trans.inverted()) 858 | width, height = bbox.width, bbox.height 859 | if width > height: 860 | x = x.reshape(1, -1) 861 | else: 862 | x = x.reshape(-1, 1) 863 | h = ax.imshow(x, aspect='auto', cmap=self.cmap, interpolation='none', origin='lower') 864 | if width > height: 865 | if self.label_pos == 'left': 866 | ax.set_ylabel(self.name, fontsize=10, rotation=0, va='center', ha='right') 867 | elif self.label_pos == 'right': 868 | ax.yaxis.set_label_position('right') 869 | ax.set_ylabel(self.name, fontsize=10, rotation=0, va='center', ha='left') 870 | plt.setp(ax.get_yticklabels(), visible=False) 871 | 872 | if not show_frame: 873 | for i in ax.spines: 874 | ax.spines[i].set_visible(False) 875 | plt.setp(ax.get_xticklabels(), visible=False) 876 | plt.setp(ax.get_yticklabels(), visible=False) 877 | 878 | for line in ax.xaxis.get_ticklines() + ax.yaxis.get_ticklines(): 879 | line.set_markersize(0) 880 | line.set_markeredgewidth(0) 881 | 882 | # prepare legend 883 | if self.cohort_s.dtype.name == 'category': 884 | for c in self.cohort_s.cat.categories: 885 | ax.scatter(np.nan, np.nan, c=[self.colors[c]], label=c, s=30, marker='s') 886 | 887 | 888 | def check_labels(labels): 889 | if labels is not None: 890 | if isinstance(labels, CohortLabel): 891 | labels = [labels] 892 | else: 893 | assert all([isinstance(i, CohortLabel) for i in labels]) 894 | n = len(labels) 895 | else: 896 | n = 0 897 | return labels, n 898 | 899 | 900 | def clustermap(df, Zx=None, Zy=None, cluster=True, aw=3, ah=3, lw=1, vmin=None, vmax=None, cmap=plt.cm.Blues, 901 | norm=None, origin='lower', dendrogram_pos='top', col_labels=None, row_labels=None, 902 | fontsize=10, clabel='', cfontsize=10, label_colors=None, colorbar=True, 
colorbar_orientation='vertical', 903 | method='average', metric='euclidean', optimal_ordering=False, value_labels=False, 904 | show_xlabels=False, show_ylabels=False, tick_length=0, rotation=-45, ha='left', va='top', 905 | tri=False, rasterized=False, count_sort=False, 906 | show_frame=False, dl=1, dr=1, dt=0.2, lh=0.1, ls=0.01, 907 | db=1.5, dd=0.4, ds=0.03, ch=1, cw=0.175, dc=0.1, dtc=0): 908 | """""" 909 | col_labels, nc = check_labels(col_labels) 910 | row_labels, nr = check_labels(row_labels) 911 | 912 | if Zx is None and cluster: 913 | Zx = hierarchy.linkage(df.T, method=method, metric=metric, optimal_ordering=optimal_ordering) 914 | Zy = hierarchy.linkage(df, method=method, metric=metric, optimal_ordering=optimal_ordering) 915 | elif Zy is None and cluster: 916 | Zy = Zx 917 | 918 | fw = dl + aw + dr + nr*(lh+ls) 919 | fh = db + ah + ds + dd + dt + nc*(lh+ls) 920 | fig = plt.figure(figsize=(fw,fh), facecolor='none') 921 | dl2 = dl + nr*(lh+ls) 922 | if dendrogram_pos == 'top': 923 | ax = fig.add_axes([dl2/fw, db/fh, aw/fw, ah/fh]) 924 | # column labels 925 | tax = [] 926 | for k in range(nc): 927 | tax.append( 928 | fig.add_axes([dl2/fw, (db+ah+(k+1)*ls+k*lh)/fh, aw/fw, lh/fh], sharex=ax) 929 | ) 930 | # row labels 931 | lax = [] 932 | for k in range(nr): 933 | lax.append( 934 | fig.add_axes([(dl+k*(lh+ls))/fw, db/fh, lh/fw, ah/fh], sharey=ax) 935 | ) 936 | # dendrogram 937 | dax = fig.add_axes([dl2/fw, (db+ah+nc*(ls+lh)+ds)/fh, aw/fw, dd/fh]) 938 | axes = [ax, *lax, *tax, dax] 939 | # colorbar 940 | if colorbar: 941 | if colorbar_orientation == 'vertical': 942 | cax = fig.add_axes([(dl2+aw+dc)/fw, (db+ah-ch-dtc)/fh, cw/fw, ch/fh], label='Colorbar') 943 | else: 944 | cax = fig.add_axes([(dl2+aw-ch-dtc)/fw, (db-cw-dc)/fh, ch/fw, cw/fh], label='Colorbar') 945 | axes.append(cax) 946 | else: 947 | dax = fig.add_axes([dl/fw, db/fh, aw/fw, dd/fh]) 948 | ax = fig.add_axes([dl/fw, (db+dd+ds)/fh, aw/fw, ah/fh]) 949 | axes = [ax, dax] 950 | if colorbar: 951 | cax = fig.add_axes([(dl+aw+dc)/fw, (db+dd+ds)/fh, cw/fw, ch/fh]) 952 | axes.append(cax) 953 | 954 | if Zx is not None: 955 | with plt.rc_context({'lines.linewidth': lw}): 956 | z = hierarchy.dendrogram(Zx, ax=dax, count_sort=count_sort, orientation='top', link_color_func=lambda k: 'k') 957 | ix = df.columns[z['leaves']] # equivalent to hierarchy.leaves_list(Zx) if count_sort=False 958 | else: 959 | ix = df.columns 960 | dax.axis('off') 961 | 962 | if Zy is not None: 963 | iy = df.index[hierarchy.leaves_list(Zy)] 964 | elif df.index.equals(df.columns): 965 | iy = ix 966 | else: 967 | iy = df.index 968 | 969 | if dendrogram_pos == 'bottom': 970 | dax.invert_yaxis() 971 | 972 | df = df.loc[iy, ix].copy() 973 | if tri: 974 | if dendrogram_pos == 'top': 975 | if origin == 'upper': 976 | df.values[np.tril_indices(df.shape[0], -1)] = np.nan 977 | else: 978 | df.values[np.triu_indices(df.shape[0])] = np.nan 979 | elif dendrogram_pos == 'bottom': 980 | df.values[np.tril_indices(df.shape[0])] = np.nan 981 | 982 | if value_labels: 983 | irange = np.arange(df.shape[0]) 984 | jrange = np.arange(df.shape[1]) 985 | for i in irange: 986 | for j in jrange: 987 | if not np.isnan(df.values[j,i]): 988 | ax.text(i, j, f'{df.values[j,i]:.2f}', ha='center', va='center') 989 | 990 | h = ax.imshow(df.values, origin=origin, cmap=cmap, vmin=vmin, vmax=vmax, norm=norm, 991 | interpolation='none', rasterized=rasterized, aspect='auto') 992 | if show_xlabels: 993 | ax.set_xticks(np.arange(df.shape[1])) 994 | if rotation not in [90, -90]: 995 | ax.set_xticklabels(ix, 
rotation=rotation, rotation_mode='anchor', ha=ha, va=va, fontsize=fontsize) 996 | else: 997 | ax.set_xticklabels(ix, rotation=rotation, ha=ha, va=va, fontsize=fontsize) 998 | ax.tick_params(axis='x', length=tick_length) 999 | if show_xlabels == 'top': 1000 | ax.tick_params(top=True, labeltop=True, bottom=False, labelbottom=False) 1001 | else: 1002 | ax.set_xticks([]) 1003 | 1004 | if show_ylabels: 1005 | ax.set_yticks(np.arange(df.shape[0])) 1006 | ax.set_yticklabels(iy, fontsize=fontsize) 1007 | ax.tick_params(axis='y', length=tick_length) 1008 | else: 1009 | ax.set_yticks([]) 1010 | 1011 | # plot labels 1012 | for k in range(nr): 1013 | row_labels[k].plot(ax=lax[k], ix=iy, show_frame=True) 1014 | for k in range(nc): 1015 | col_labels[k].plot(ax=tax[k], ix=ix, show_frame=True) 1016 | 1017 | if lax: 1018 | plt.setp(ax.get_yticklabels(), visible=False) 1019 | for line in ax.yaxis.get_ticklines(): 1020 | line.set_markersize(0) 1021 | line.set_markeredgewidth(0) 1022 | 1023 | if dendrogram_pos == 'bottom': 1024 | ax.yaxis.tick_right() 1025 | # else: 1026 | # ax.xaxis.tick_top() 1027 | 1028 | if label_colors is not None: # plot color legend 1029 | s = 1.015 1030 | xlim = ax.get_xlim() 1031 | b = xlim[1] - s*np.diff(xlim) 1032 | ax.set_xlim(xlim) 1033 | ax.scatter([b]*df.shape[1], np.arange(df.shape[1]), s=40, c=label_colors.iloc[hierarchy.leaves_list(Zx)], clip_on=False) 1034 | ax.set_yticks(np.arange(df.shape[0])) 1035 | ax.set_yticklabels(iy, fontsize=fontsize) 1036 | ax.tick_params(axis='y', pad=12, length=0) 1037 | 1038 | # s = 1.02 1039 | # ylim = ax.get_ylim() 1040 | # b = ylim[1] - s*np.diff(ylim) 1041 | # ax.set_ylim(ylim) 1042 | # ax.scatter(np.arange(df.shape[1]), [b]*df.shape[1], s=36, c=label_colors[hierarchy.leaves_list(Zx)], clip_on=False) 1043 | # ax.tick_params(axis='x', pad=12) 1044 | 1045 | # plot colorbar 1046 | if colorbar: 1047 | cbar = plt.colorbar(h, cax=cax, orientation=colorbar_orientation) 1048 | if norm is None: 1049 | cax.locator_params(nbins=4) 1050 | cbar.set_label(clabel, fontsize=cfontsize+1) 1051 | cax.tick_params(labelsize=cfontsize) 1052 | 1053 | if not show_frame: 1054 | for i in ['left', 'top', 'right', 'bottom']: 1055 | ax.spines[i].set_visible(False) 1056 | 1057 | plt.sca(ax) 1058 | return axes 1059 | 1060 | 1061 | def hexdensity(x, y, bounds=None, bins='log', scale='log', 1062 | cmap=None, vmin=None, vmax=None, ax=None, cax=None, 1063 | unit='TPM', entity='genes', 1064 | gridsize=175, fontsize=12, show_corr=True, clip_on=True, rasterized=False): 1065 | """Wrapper for hexbin""" 1066 | 1067 | if ax is None: # setup new axes 1068 | ax, cax = setup_figure(2, 2, xspace=[0.75, 1], yspace=[0.75, 0.5], colorbar=True, ch=1, cw=0.12) 1069 | ax.margins(0.01) 1070 | format_plot(ax, fontsize=fontsize-2, x_offset=6, y_offset=6) 1071 | 1072 | if cmap is None: 1073 | cmap = copy.copy(plt.cm.RdYlBu_r) 1074 | cmap.set_bad('w', 1.) 
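# Note: the Spearman correlation below is computed on the full input,
# including zeros; zeros are then masked as NaN so that they are excluded
# from the log-scaled hexbin.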
1075 | 1076 | rho = scipy.stats.spearmanr(x, y)[0] 1077 | x = x.copy() 1078 | y = y.copy() 1079 | nanidx = (x == 0) | (y == 0) 1080 | if any(nanidx): 1081 | x[nanidx] = np.nan 1082 | y[nanidx] = np.nan 1083 | 1084 | h = ax.hexbin(x, y, bins=bins, xscale=scale, yscale=scale, linewidths=0.1, 1085 | gridsize=gridsize, cmap=cmap, vmin=vmin, vmax=vmax, mincnt=1, zorder=1, 1086 | clip_on=clip_on, rasterized=rasterized) 1087 | 1088 | if bounds is None: 1089 | xlim = ax.get_xlim() 1090 | ylim = ax.get_ylim() 1091 | bounds = [np.minimum(xlim[0], ylim[0]), np.maximum(xlim[1], ylim[1])] 1092 | elif len(bounds) == 2: 1093 | ax.set_xlim(bounds) 1094 | ax.set_ylim(bounds) 1095 | else: 1096 | ax.set_xlim(bounds[:2]) 1097 | ax.set_ylim(bounds[2:]) 1098 | 1099 | if show_corr: 1100 | t = ax.text(1, 0, r'$\rho$ = {:.2f}'.format(rho), transform=ax.transAxes, 1101 | ha='right', va='bottom', fontsize=fontsize, zorder=2) 1102 | t.set_bbox(dict(facecolor='w', alpha=0.5, edgecolor='none', boxstyle="round,pad=0.1")) 1103 | 1104 | if cax is not None: 1105 | hc = plt.colorbar(h, cax=cax, orientation='vertical', ticks=ticker.LogLocator(numticks=4)) 1106 | hc.set_label('log$\mathregular{_{10}}$('+entity+')', fontsize=fontsize) 1107 | 1108 | if isinstance(x, pd.Series): 1109 | ax.set_xlabel(f'{x.name} ({unit})' if unit is not None else f'{x.name}', fontsize=fontsize) 1110 | if isinstance(y, pd.Series): 1111 | ax.set_ylabel(f'{y.name} ({unit})' if unit is not None else f'{y.name}', fontsize=fontsize) 1112 | 1113 | return ax, cax 1114 | -------------------------------------------------------------------------------- /qtl/sam.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import subprocess 4 | import io 5 | 6 | 7 | 8 | 9 | def is_stranded(bam_file, paired_end=True, verbose=False): 10 | """ 11 | Determine whether the sequencing protocol was strand-specific 12 | based on reads mapping to ACTB(-) and FTL(+) 13 | """ 14 | 15 | header = subprocess.check_output(f'samtools view -H {bam_file}', shell=True).decode() 16 | header = header.strip().split('\n') 17 | header = [i.split('\t') for i in header if i.startswith('@SQ')] 18 | c = [i[1].split(':')[1] for i in header] 19 | if not np.any([i.startswith('chr') for i in c]): # assume hg19 20 | plus_str = '19:49468558-49470135' # FTL 21 | minus_str = '7:5566782-5603415' # ACTB 22 | else: # hg38 23 | plus_str = 'chr19:48965301-48966878' # FTL 24 | minus_str = 'chr7:5527151-5563784' # ACTB 25 | 26 | if paired_end: 27 | cmd = f'samtools view -q 255 -f2 -F3840 {bam_file} {plus_str} | cut -f2 | sort | uniq -c' 28 | with subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) as p: 29 | s, stderr = p.communicate() 30 | if stderr == '[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n': 31 | raise ValueError('BAM/CRAM file must be indexed.') 32 | dfp = pd.read_csv(io.StringIO(s), sep='\s+', header=None, names=['count', 'flag']).set_index('flag').squeeze() 33 | 34 | cmd = f'samtools view -q 255 -f2 -F3840 {bam_file} {minus_str} | cut -f2 | sort | uniq -c' 35 | s = subprocess.check_output(cmd, shell=True).decode() 36 | dfm = pd.read_csv(io.StringIO(s), sep='\s+', header=None, names=['count', 'flag']).set_index('flag').squeeze() 37 | 38 | dfp = dfp.reindex([147, 99, 83, 163], fill_value=0) 39 | dfm = dfm.reindex([147, 99, 83, 163], fill_value=0) 40 | 41 | p = dfp[[147, 99]].sum() / dfp.sum() 42 | m = dfm[[163, 83]].sum() / 
dfm.sum() 43 | else: 44 | s = subprocess.check_output(f'samtools view -q 255 {bam_file} {plus_str} | cut -f2 | sort | uniq -c', shell=True).decode() 45 | dfp = pd.read_csv(io.StringIO(s), sep='\s+', header=None, names=['count', 'flag']).set_index('flag').squeeze() 46 | s = subprocess.check_output(f'samtools view -q 255 {bam_file} {minus_str} | cut -f2 | sort | uniq -c', shell=True).decode() 47 | dfm = pd.read_csv(io.StringIO(s), sep='\s+', header=None, names=['count', 'flag']).set_index('flag').squeeze() 48 | dfp = dfp.reindex([0, 16], fill_value=0) 49 | dfm = dfm.reindex([0, 16], fill_value=0) 50 | p = dfp[0].sum() / dfp.sum() 51 | m = dfm[16].sum() / dfm.sum() 52 | 53 | is_stranded = (p<0.02) & (m<0.02) 54 | if verbose: 55 | print(f'Total read coverage: {dfp.sum()} (FTL), {dfm.sum()} (ACTB)') 56 | print(f'Proportion of FTL(+) reads on -strand: {p:.4g}') 57 | print(f'Proportion of ACTB(-) reads on +strand: {m:.4g}') 58 | print(f'Stranded: {is_stranded}') 59 | 60 | return is_stranded 61 | -------------------------------------------------------------------------------- /qtl/stats.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy.stats as stats 4 | 5 | 6 | class Residualizer(object): 7 | def __init__(self, C, fail_colinear=False): 8 | # center and orthogonalize 9 | self.Q, R = np.linalg.qr(C - np.mean(C,0)) 10 | self.dof = C.shape[0] - 2 - C.shape[1] 11 | 12 | # check for colinearity 13 | colinear_ix = np.abs(np.diag(R)) < np.finfo(np.float64).eps * C.shape[1] 14 | if np.any(colinear_ix): 15 | if fail_colinear: 16 | raise ValueError("Colinear or zero covariates detected") 17 | else: # drop colinear covariates 18 | print(f' * dropped colinear covariates: {np.sum(colinear_ix)}') 19 | self.Q = self.Q[:, ~colinear_ix] 20 | 21 | def transform(self, df, center=False): 22 | """Residualize rows of df wrt columns of C""" 23 | # transform input 24 | if isinstance(df, pd.DataFrame) or isinstance(df, pd.Series): 25 | M = df.values 26 | else: 27 | M = df 28 | 29 | isvector = False 30 | if isinstance(M, list) or (hasattr(M, 'shape') and len(M.shape)==1): 31 | M = np.array(M).reshape(1,-1) 32 | isvector = True 33 | 34 | # residualize M relative to C 35 | M0 = M - np.mean(M, axis=1, keepdims=True) 36 | if center: 37 | M0 = M0 - np.dot(np.dot(M0, self.Q), self.Q.T) 38 | else: 39 | M0 = M - np.dot(np.dot(M0, self.Q), self.Q.T) # retain original mean 40 | 41 | if isvector: 42 | M0 = M0[0] 43 | 44 | if isinstance(df, pd.DataFrame): 45 | M0 = pd.DataFrame(M0, index=df.index, columns=df.columns) 46 | elif isinstance(df, pd.Series): 47 | M0 = pd.Series(M0, index=df.index, name=df.name) 48 | 49 | return M0 50 | 51 | 52 | def residualize(df, C, center=False, fail_colinear=False): 53 | r = Residualizer(C, fail_colinear=fail_colinear) 54 | return r.transform(df, center=center) 55 | 56 | 57 | def center_normalize(x, axis=0): 58 | """Center and normalize x""" 59 | if isinstance(x, pd.DataFrame): 60 | x0 = x - np.mean(x.values, axis=axis, keepdims=True) 61 | return x0 / np.sqrt(np.sum(x0.pow(2).values, axis=axis, keepdims=True)) 62 | elif isinstance(x, pd.Series): 63 | x0 = x - x.mean() 64 | return x0 / np.sqrt(np.sum(x0*x0)) 65 | elif isinstance(x, np.ndarray): 66 | x0 = x - np.mean(x, axis=axis, keepdims=True) 67 | return x0 / np.sqrt(np.sum(x0*x0, axis=axis)) 68 | 69 | 70 | def padjust_bh(p): 71 | """ 72 | Benjamini-Hochberg adjusted p-values 73 | 74 | Replicates p.adjust(p, method="BH") from R 75 | """ 76 | n = len(p) 
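    # BH procedure: rank p-values in descending order, scale each by n/rank,
    # enforce monotonicity with a running minimum, and map the result back
    # to the input order via the inverse permutation (ro).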
77 | i = np.arange(n,0,-1) 78 | o = np.argsort(p)[::-1] 79 | ro = np.argsort(o) 80 | pa = np.minimum(1, np.minimum.accumulate(np.float64(n)/i * np.array(p)[o]))[ro] 81 | if isinstance(p, pd.Series): 82 | pa = pd.Series(pa, index=p.index) 83 | return pa 84 | 85 | 86 | def pi0est(p, lambda_param): 87 | """ 88 | pi0 statistic (Storey and Tibshirani, 2003) 89 | 90 | For fixed values of 'lambda'; equivalent to the qvalue::pi0est 91 | from R package qvalue 92 | """ 93 | if np.min(p) < 0 or np.max(p) > 1: 94 | raise ValueError("p-values not in valid range [0, 1]") 95 | elif np.min(lambda_param) < 0 or np.max(lambda_param) >= 1: 96 | raise ValueError("lambda must be within [0, 1)") 97 | 98 | pi0 = np.mean(p >= lambda_param) / (1 - lambda_param) 99 | pi0 = np.minimum(pi0, 1) 100 | 101 | if pi0 <= 0: 102 | raise ValueError("The estimated pi0 <= 0. Check that you have valid p-values or use a different range of lambda.") 103 | 104 | return pi0 105 | 106 | 107 | def qvalue(p, lambda_param=0.5, pi0=None): 108 | """ 109 | q-value calculation for fixed 'lambda' from Storey and Tibshirani, 2003. 110 | """ 111 | if isinstance(p, pd.Series): 112 | ix = p.index 113 | p = p.values 114 | else: 115 | ix = None 116 | 117 | if pi0 is None: 118 | pi0 = pi0est(p, lambda_param) 119 | 120 | u = np.argsort(p) 121 | m = len(p) 122 | v = stats.rankdata(p, method='max') # sort p 123 | qvals = (pi0 * m * p)/v 124 | 125 | qvals[u[m-1]] = np.minimum(qvals[u[m-1]], 1) 126 | for i in range(m-2, -1, -1): 127 | qvals[u[i]] = np.minimum(qvals[u[i]], qvals[u[i+1]]) 128 | 129 | if ix is not None: 130 | qvals = pd.Series(qvals, index=ix) 131 | return qvals 132 | 133 | 134 | def bootstrap_pi1(pval, lambda_param=0.5, bounds=[2.5, 97.5], n=1000): 135 | """Compute confidence intervals for pi1 with bootstrapping""" 136 | pi1_boot = [] 137 | nfail = 0 138 | for _ in range(n): 139 | try: 140 | pi1_boot.append(1 - pi0est(np.random.choice(pval, len(pval), replace=True), lambda_param=lambda_param)) 141 | except: 142 | nfail += 1 143 | if nfail > 0: 144 | print(f'Warning: {nfail} bootstraps failed') 145 | pi1_boot = np.array(pi1_boot) 146 | if len(pi1_boot) > 0: 147 | ci = np.percentile(pi1_boot, bounds) 148 | else: 149 | ci = np.array([np.nan, np.nan]) 150 | return ci 151 | -------------------------------------------------------------------------------- /qtl/torus.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy.stats as stats 4 | import os 5 | import matplotlib.pyplot as plt 6 | import matplotlib.patches as patches 7 | 8 | from . 
import plot 9 | 10 | torus_dict = { 11 | 'TF_BINDING_SITE': 'TF binding site', 12 | 'CTCF_BINDING_SITE': 'CTCF binding site', 13 | 'INTRON_VARIANT': 'Intron variant', 14 | 'SYNONYMOUS_VARIANT': 'Synonymous variant', 15 | 'SPLICE_DONOR_VARIANT': 'Splice donor variant', 16 | 'NON_CODING_TRANSCRIPT_EXON_VARIANT': 'Non-coding transcript exon variant', 17 | 'MISSENSE_VARIANT': 'Missense variant', 18 | 'STOP_GAINED': 'Stop gained', 19 | '3_PRIME_UTR_VARIANT': "3' UTR variant", 20 | 'FRAMESHIFT_VARIANT': 'Frameshift variant', 21 | 'OPEN_CHROMATIN_REGION': 'Open chromatin region', 22 | 'SPLICE_REGION_VARIANT': 'Splice region variant', 23 | '5_PRIME_UTR_VARIANT': "5' UTR variant", 24 | 'PROMOTER': 'Promoter', 25 | 'ENHANCER': 'Enhancer', 26 | 'PROMOTER_FLANKING_REGION': 'Promoter-flanking region', 27 | 'SPLICE_ACCEPTOR_VARIANT': 'Splice acceptor variant', 28 | } 29 | 30 | torus_short_dict = {i:i.replace(' variant', '') for i in torus_dict.values()} 31 | torus_short_dict['Open chromatin region'] = 'Open chromatin' 32 | torus_short_dict['Promoter-flanking region'] = 'Promoter-flanking' 33 | torus_short_dict['Non-coding transcript exon variant'] = 'NC transcript' 34 | 35 | # enhancer_d: Enhancer 36 | # promoter_d: Promoter 37 | # open_chromatin_region_d: Open chromatin 38 | # promoter_flanking_region_d: Promoter-flanking 39 | # CTCF_binding_site_d: CTCF binding site 40 | # TF_binding_site_d: TF binding site 41 | # 3_prime_UTR_variant_d: 3' UTR 42 | # 5_prime_UTR_variant_d: 5' UTR 43 | # frameshift_variant_d: Frameshift 44 | # intron_variant_d: Intron 45 | # missense_variant_d: Missense 46 | # non_coding_transcript_exon_variant_d: NC transcript 47 | # splice_acceptor_variant_d: Splice acceptor 48 | # splice_donor_variant_d: Splice donor 49 | # splice_region_variant_d: Splice region 50 | # stop_gained_d: Stop gained 51 | # synonymous_variant_d: Synonymous 52 | 53 | 54 | def convert_torus(tensorqtl_files, out_file, phenotype_groups_file=None, mode='xQTL'): 55 | """Convert tensorQTL parquet files to Torus input format""" 56 | if os.path.exists(out_file): 57 | raise ValueError('Output file already exists') 58 | assert mode in ['xQTL', 'ixQTL'] 59 | 60 | if phenotype_groups_file is not None: 61 | group_s = pd.read_csv(phenotype_groups_file, sep='\t', index_col=0, header=None, squeeze=True) 62 | group_size_s = group_s.value_counts() 63 | 64 | if mode=='xQTL': 65 | cols = ['phenotype_id', 'variant_id', 'tss_distance', 'pval_nominal', 'slope', 'slope_se'] 66 | elif mode=='ixQTL': 67 | cols = ['phenotype_id', 'variant_id', 'tss_distance', 'pval_gi', 'b_gi', 'b_gi_se'] 68 | 69 | for f in tensorqtl_files: 70 | print(f) 71 | df = pd.read_parquet(f, columns=cols) 72 | df['phenotype_id'] = df['phenotype_id'].apply(lambda x: x.rsplit(':',1)[-1]) 73 | if phenotype_groups_file is not None: 74 | print(' * adjusting p-values by phenotype group size') 75 | if mode=='xQTL': 76 | df['pval_nominal'] = np.minimum(df['pval_nominal']*df['phenotype_id'].map(group_size_s), 1.0) 77 | elif mode=='ixQTL': 78 | df['pval_gi'] = np.minimum(df['pval_gi']*df['phenotype_id'].map(group_size_s), 1.0) 79 | df.to_csv(out_file, sep=' ', float_format='%.6g', compression='gzip', mode='a', index=False, header=None) 80 | 81 | 82 | def load(torus_output, log2=True, short_labels=True): 83 | torus_df = pd.read_csv(torus_output, sep='\s+', index_col=0, header=None) 84 | torus_df.columns = ['mean', 'CI5', 'CI95'] 85 | torus_df.index.name = 'feature' 86 | torus_df.drop('Intercept', axis=0, inplace=True) 87 | torus_df = 
torus_df[~torus_df.index.str.startswith('dtss')] 88 | torus_df.index = torus_df.index.map(lambda x: torus_dict.get(x.strip('.1').upper(), x.strip('.1').upper())) 89 | if short_labels: 90 | torus_df.index = torus_df.index.map(lambda x: torus_short_dict.get(x, x)) 91 | if log2: 92 | torus_df *= np.log2(np.e) 93 | return torus_df 94 | 95 | 96 | def load_summary(summary_file, log2=True): 97 | """Load aggregated output""" 98 | 99 | torus_df = pd.read_csv(summary_file, sep='\t', index_col=0) 100 | torus_df = torus_df[~torus_df.index.str.startswith('dtss')] 101 | torus_df.drop('Intercept', inplace=True) 102 | torus_df.index = torus_df.index.map(lambda x: torus_dict[x.strip('.1').upper()]) 103 | 104 | lor_df = torus_df[torus_df.columns[torus_df.columns.str.endswith('lor')]].copy() 105 | lor_df.columns = lor_df.columns.str.replace('.lor','') 106 | 107 | if log2: 108 | lor_df = np.log2(np.exp(lor_df)) 109 | 110 | return lor_df 111 | 112 | 113 | def test_significance(torus_df1, torus_df2): 114 | assert np.all(torus_df1.index==torus_df2.index) 115 | se = (torus_df1['CI95']-torus_df1['CI5'] + torus_df2['CI95']-torus_df2['CI5']) / 3.919927969080108 116 | mu = torus_df1['mean'] - torus_df2['mean'] 117 | zstat = mu / se 118 | pval = 2*stats.norm.sf(np.abs(zstat)) 119 | m = pval<0.05/torus_df1.shape[0] 120 | return pd.DataFrame([pval, m], index=['pval', 'signif_bonferroni'], columns=torus_df1.index).T 121 | --------------------------------------------------------------------------------
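Usage sketch for the torus module (file names are hypothetical; a minimal example assuming Torus output for two QTL types with matching annotation sets, as required by test_significance):

    import qtl.torus as torus
    import qtl.plot as plot

    eqtl_df = torus.load('eqtl.torus.out')    # columns: mean, CI5, CI95 (log2-scaled by default)
    sqtl_df = torus.load('sqtl.torus.out')
    signif_df = torus.test_significance(eqtl_df, sqtl_df)
    plot.plot_effects([eqtl_df, sqtl_df],
                      [{'fmt': 'o', 'color': 'tab:blue', 'ms': 4},
                       {'fmt': 'o', 'color': 'tab:orange', 'ms': 4}])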