├── .gitignore
├── LICENSE
├── README.md
├── pyproject.toml
└── qtl
    ├── __init__.py
    ├── annotation.py
    ├── coloc.py
    ├── core.py
    ├── genotype.py
    ├── gtex.py
    ├── io.py
    ├── locusplot.py
    ├── map.py
    ├── norm.py
    ├── pca.py
    ├── pileup.py
    ├── plot.py
    ├── sam.py
    ├── stats.py
    └── torus.py

/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
*.egg-info/
*.ipynb_checkpoints/
.DS_Store
__*__
build/
dist/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2019, The Broad Institute, Inc. and The General Hospital Corporation.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## pyQTL

pyQTL is a Python module for analyzing and visualizing quantitative trait loci (QTL) data.

The following functionalities are provided:
* `qtl.annotation`: class for working with gene annotations; includes a [GTF](https://www.gencodegenes.org/pages/data_format.html) parser.
* `qtl.coloc`: Python implementation of core functions from the [R COLOC package](https://github.com/chr1swallace/coloc).
* `qtl.io`: functions for reading/writing BED and GCT files.
* `qtl.locusplot`: functions for generating LocusZoom-style regional association plots.
* `qtl.pileup`: functions for visualizing QTL effects in read pileups from, e.g., RNA-seq data.
* `qtl.plot`: plotting functions for QTLs.

### Install
You can install pyQTL using pip:
```
pip3 install qtl
```
or directly from this repository:
```
$ git clone git@github.com:broadinstitute/pyqtl.git
$ cd pyqtl
# set up virtual environment and install
$ virtualenv venv
$ source venv/bin/activate
(venv)$ pip install -e .
```
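### Examples
A minimal sketch of two common operations (file paths below are placeholders):
```
import qtl.io

# load an expression matrix in GCT format
df = qtl.io.read_gct('expression.gct.gz', load_description=False)

# BED-formatted transcription start sites from a GENCODE GTF
tss_df = qtl.io.gtf_to_tss_bed('gencode.v26.annotation.gtf.gz')
```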
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "qtl"
version = "0.1.10"
dependencies = [
    "biopython",
    "bx-python",
    "pyBigWig",
    "matplotlib",
    "numpy",
    "pandas",
    "scipy",
    "seaborn",
]
authors = [
    {name = "Francois Aguet", email = "francois@broadinstitute.org"}
]
maintainers = [
    {name = "Francois Aguet", email = "francois@broadinstitute.org"}
]
description = "Utilities for analyzing and visualizing QTL data"
readme = "README.md"
license = {file = "LICENSE"}
keywords = ["Quantitative trait loci"]
classifiers = [
    "Development Status :: 4 - Beta",
    "Programming Language :: Python :: 3",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Bio-Informatics",
]

[project.urls]
Repository = "https://github.com/broadinstitute/pyqtl.git"

--------------------------------------------------------------------------------
/qtl/__init__.py:
--------------------------------------------------------------------------------
import importlib.metadata
__version__ = importlib.metadata.version(__name__)
from .core import *

--------------------------------------------------------------------------------
/qtl/coloc.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import scipy.stats as stats
from statsmodels.formula.api import ols
import itertools

# Code adapted from
# https://github.com/chr1swallace/coloc/blob/master/R/claudia.R


def var_data(f, N):
    """
    Variance of MLE of beta for quantitative trait, assuming var(y) = 1

    Args:
        f: minor allele freq
        N: sample size

    Returns:
        variance of MLE beta
    """
    return 1 / (2 * N * f * (1 - f))


def var_data_cc(f, N, s):
    """
    Variance of MLE of beta for case-control

    Args:
        f: minor allele freq
        N: sample size
        s: proportion of samples that are cases

    Returns:
        variance of MLE beta
    """
    return 1 / (2 * N * f * (1 - f) * s * (1 - s))


def logsum(x):
    """Computes log(sum(ABF)), where x = log(ABF)"""
    mmax = np.max(x)
    return mmax + np.log(np.sum(np.exp(x-mmax)))


def logdiff(x, y):
    """Computes log(exp(x) - exp(y)) in a numerically stable way"""
    mmax = np.maximum(np.max(x), np.max(y))
    return mmax + np.log(np.exp(x - mmax) - np.exp(y - mmax))
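
# Note: logsum/logdiff operate on the log scale to avoid overflow. As a quick
# illustration (not part of the module): for x = np.array([1000., 1000.]),
# np.log(np.sum(np.exp(x))) overflows, whereas logsum(x) returns
# 1000 + np.log(2) ≈ 1000.693.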

def approx_bf_p(p, f, N, s=None, type='quant'):
    """
    Calculate approximate Bayes Factors

    Args:
        p: p-value
        f: minor allele frequency
        N: sample size
        s: proportion of samples that are cases
        type: 'quant' or 'cc'

    Returns:
        Data frame with lABF and intermediate calculations
    """
    if type == 'quant':
        sd_prior = 0.15
        v = var_data(f, N)
    else:
        sd_prior = 0.2
        v = var_data_cc(f, N, s)
    z = stats.norm.isf(0.5 * p)
    # shrinkage factor: ratio of the prior variance to the total variance
    r = sd_prior**2 / (sd_prior**2 + v)
    # approximate BF, on the natural-log scale (comparable to log likelihood-ratio differences)
    labf = 0.5 * (np.log(1 - r) + r*z*z)
    return pd.DataFrame({'v':v, 'z':z, 'r':r, 'lABF':labf})


def approx_bf_estimates(z, v, type='quant', sdy=1):
    """
    Calculates approximate Bayes Factors using the variance of the regression coefficients

    See eq. (2) in Wakefield, 2009 and Supplementary methods from Giambartolomei et al., 2014.

    Args:
        z: normal deviate associated with regression coefficient and its variance (in effect the t-statistic, beta/beta_se)
        v: variance of the regression coefficient (beta_se**2)
        type: 'quant' or 'cc'
        sdy: standard deviation of the trait

    Returns:
        Data frame with lABF and intermediate calculations
    """
    if type == 'quant':
        sd_prior = 0.15*sdy
    else:
        sd_prior = 0.2
    r = sd_prior**2 / (sd_prior**2 + v)
    labf = 0.5 * (np.log(1 - r) + r*z*z)
    return pd.DataFrame({'v':v, 'z':z, 'r':r, 'lABF':labf})


def combine_abf(l1, l2, p1=1e-4, p2=1e-4, p12=1e-5, verbose=False):
    """
    Calculate posterior probabilities for configurations, given logABFs for each SNP and prior probabilities

    Args:
        l1: logABFs for trait 1
        l2: logABFs for trait 2
        p1: prior probability a SNP is associated with trait 1, default 1e-4
        p2: prior probability a SNP is associated with trait 2, default 1e-4
        p12: prior probability a SNP is associated with both traits, default 1e-5
        verbose: print the posterior probabilities

    Returns:
        pd.Series of posterior probabilities
    """
    lsum = l1 + l2
    lh0_abf = 0
    lh1_abf = np.log(p1) + logsum(l1)
    lh2_abf = np.log(p2) + logsum(l2)
    lh3_abf = np.log(p1) + np.log(p2) + logdiff(logsum(l1) + logsum(l2), logsum(lsum))
    lh4_abf = np.log(p12) + logsum(lsum)
    all_abf = [lh0_abf, lh1_abf, lh2_abf, lh3_abf, lh4_abf]
    my_denom_log_abf = logsum(all_abf)  # denominator in eq. 2
    pp_abf = np.exp(all_abf - my_denom_log_abf)
    pp_abf = pd.Series(pp_abf, index=[f'pp_h{i}_abf' for i in range(5)])
    if verbose:
        print(pp_abf)
        print(f"PP abf for shared variant: {pp_abf['pp_h4_abf']*100:.3f}%")
    return pp_abf
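
# Example (sketch): per-variant log ABFs for two traits via Wakefield's
# approximation, then posterior probabilities for the five hypotheses
# (H0: no association, ..., H4: shared causal variant). Assumes standardized
# traits (sdy=1) and DataFrames df1/df2 with 'beta' and 'beta_se' columns:
#   labf1 = approx_bf_estimates(df1['beta']/df1['beta_se'], df1['beta_se']**2)['lABF']
#   labf2 = approx_bf_estimates(df2['beta']/df2['beta_se'], df2['beta_se']**2)['lABF']
#   pp = combine_abf(labf1, labf2)   # pd.Series: pp_h0_abf ... pp_h4_abf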

def process_dataset(df, N=None, sdy=None, type='quant'):
    """
    Preprocessing steps, including calculation of approximate Bayes Factors

    Args:
        df: data frame with columns: beta, beta_se (and maf, if sdy is not provided) -or- pval, maf
        N: sample size
        sdy: standard deviation of the trait. Estimated from the beta_se and MAF if not provided.

    Returns a data frame with the additional columns:
      v (beta_se**2), z (z-score), r (shrinkage factor), lABF (log ABF)
    """
    if 'beta' in df and 'beta_se' in df:
        beta_var = df['beta_se']**2
        if sdy is None:
            print('WARNING: estimating sdy from the data')
            sdy = sdy_est(beta_var, df['maf'], N)
        res_df = approx_bf_estimates(df['beta']/df['beta_se'], beta_var, type=type, sdy=sdy)
    else:
        pval_col = df.columns[df.columns.str.startswith('pval')][0]
        res_df = approx_bf_p(df[pval_col], df['maf'], type=type, N=N, s=None)
    return df.join(res_df)


def sdy_est(vbeta, maf, n):
    """
    Estimate trait standard deviation given vectors of variance of coefficients, MAF and sample size

    Estimate is based on var(beta-hat) = var(Y) / (n * var(X))
    var(X) = 2*maf*(1-maf)
    so we can estimate var(Y) by regressing n*var(X) against 1/var(beta)

    Args:
        vbeta: vector of variance of coefficients
        maf: vector of MAF (same length as vbeta)
        n: sample size

    Returns:
        estimated standard deviation of Y
    """
    print('Warning: estimating sdY from MAF and varbeta, provide this if known.')
    oneover = 1/vbeta
    nvx = 2 * n * maf * (1-maf)  # n * var(X)
    res = ols('nvx ~ oneover - 1', {'nvx':nvx, 'oneover':oneover}).fit()
    cf = res.params['oneover']  # the regression coefficient estimates var(Y)
    return np.sqrt(cf)


def abf(df1, df2, N=None, sdy=None, p1=1e-4, p2=1e-4, p12=1e-5, verbose=False):
    """
    Colocalization analysis using approximate Bayes Factors (cf. coloc.abf in the R package)

    Args:
        df1, df2: DataFrames with columns
                  'beta' and 'beta_se' -or-
                  'pval_nominal' and 'maf'
        N: sample size, must be provided if using p-values and MAF

    Returns:
        pd.Series of posterior probabilities and the merged per-variant DataFrame
    """
    if 'sample_size' in df1:
        n1 = int(df1['sample_size'].values[0])
    else:
        assert N is not None
        n1 = N

    if 'sample_size' in df2:
        n2 = int(df2['sample_size'].values[0])
    else:
        assert N is not None
        n2 = N

    if 'p_std' in df1:
        sdy1 = float(df1['p_std'].values[0])
    else:
        sdy1 = sdy
    if 'p_std' in df2:
        sdy2 = float(df2['p_std'].values[0])
    else:
        sdy2 = sdy
    mdf1 = process_dataset(df1, N=n1, sdy=sdy1)
    mdf2 = process_dataset(df2, N=n2, sdy=sdy2)

    # note: merging on the positional index assumes df1 and df2 contain the same variants in the same order
    merged_df = pd.merge(mdf1.reset_index(drop=True), mdf2.reset_index(drop=True), suffixes=('_1', '_2'),
                         left_index=True, right_index=True)
    # merged_df = merged_df.sort_values('snp_1')
    internal_sum_lABF = merged_df['lABF_1'] + merged_df['lABF_2']
    merged_df['internal_sum_lABF'] = internal_sum_lABF
    my_denom_log_abf = logsum(internal_sum_lABF)
    merged_df['snp_pp_h4'] = np.exp(internal_sum_lABF - my_denom_log_abf)
    pp_abf = combine_abf(mdf1['lABF'], mdf2['lABF'], p1=p1, p2=p2, p12=p12, verbose=verbose)
    return pp_abf, merged_df
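
# Example (sketch): colocalize eQTL and GWAS summary statistics for one locus,
# given DataFrames df_eqtl and df_gwas with the same variants in the same order:
#   pp, merged_df = abf(df_eqtl, df_gwas, N=500)
#   pp['pp_h4_abf']   # posterior probability of a shared causal variant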

def susie(s1, s2, p1=1e-4, p2=1e-4, p12=5e-6, verbose=False, is_sorted=True):
    """
    Colocalisation with multiple causal variants using SuSiE

    s1, s2: outputs from SuSiE

    Note: this function assumes that 'lbf_variable' are indexed by 'cs_index':
      res['lbf_variable'] = res['lbf_variable'][res['sets']['cs_index']]
    See tensorqtl.susie.map() for additional details.
    """
    cs1 = s1['sets']
    cs2 = s2['sets']
    lbf1 = s1['lbf_variable']
    lbf2 = s2['lbf_variable']
    if not isinstance(lbf1, pd.DataFrame):
        lbf1 = pd.DataFrame(lbf1, columns=s1['pip'].index)
    if not isinstance(lbf2, pd.DataFrame):
        lbf2 = pd.DataFrame(lbf2, columns=s2['pip'].index)
    isnps = lbf1.columns[lbf1.columns.isin(lbf2.columns)]
    n = len(isnps)
    if cs1['cs'] is None or cs2['cs'] is None or len(cs1['cs']) == 0 or len(cs2['cs']) == 0 or n == 0:
        return None
    if verbose:
        print(f"Using {n} shared variants (of {lbf1.shape[1]} and {lbf2.shape[1]})")
    idx1 = cs1['cs_index']
    idx2 = cs2['cs_index']
    if not is_sorted:
        bf1 = lbf1.loc[idx1, isnps]
        bf2 = lbf2.loc[idx2, isnps]
    else:
        bf1 = lbf1[isnps]
        bf2 = lbf2[isnps]

    ret = bf_bf(bf1, bf2, p1=p1, p2=p2, p12=p12)
    if ret is None:  # bf_bf returns None if the variant overlap is insufficient
        return None

    ret['summary']['idx1'] = idx1[ret['summary']['idx1']]
    ret['summary']['idx2'] = idx2[ret['summary']['idx2']]
    # ret$summary[, `:=`(idx1, cs1$cs_index[idx1])]
    # ret$summary[, `:=`(idx2, cs2$cs_index[idx2])]
    return ret
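
# Example (sketch): colocalization from two SuSiE results s1, s2 (dict-like,
# with 'sets', 'lbf_variable', and 'pip' entries as described in the docstring):
#   res = susie(s1, s2, p12=5e-6)
#   if res is not None:
#       res['summary'][['hit1', 'hit2', 'pp_h4_abf']]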

def bf_bf(bf1, bf2, p1=1e-4, p2=1e-4, p12=5e-6, overlap_min=0.5, trim_by_posterior=True, verbose=False):
    """Colocalize two datasets represented by Bayes factors"""
    if isinstance(bf1, pd.Series):
        bf1 = bf1.to_frame().T
    if isinstance(bf2, pd.Series):
        bf2 = bf2.to_frame().T

    # combinations to test
    todo_df = pd.DataFrame(itertools.product(range(len(bf1)), range(len(bf2))), columns=['i', 'j'])
    todo_df['pp4'] = 0

    isnps = bf1.columns[bf1.columns.isin(bf2.columns)]
    if len(isnps) == 0:
        return None

    pp1 = logbf_to_pp(bf1, p1, last_is_null=True)
    pp2 = logbf_to_pp(bf2, p2, last_is_null=True)
    ph0_1 = 1 - np.sum(pp1, 1)
    ph0_2 = 1 - np.sum(pp2, 1)

    prop1 = pp1[isnps].sum(1) / pp1.sum(1)
    prop2 = pp2[isnps].sum(1) / pp2.sum(1)

    if trim_by_posterior:
        # drop combinations with insufficient overlapping variants
        drop = (prop1.values[todo_df['i']] < overlap_min) | (prop2.values[todo_df['j']] < overlap_min)
        if all(drop):
            print("WARNING: snp overlap too small between datasets: too few snps with high posterior in one trait represented in other")
            return None
            # return(list(summary = cbind(data.table(nsnps = length(isnps),
            #        hit1 = colnames(pp1)[apply(pp1, 1, which.max)][todo$i],
            #        hit2 = colnames(pp2)[apply(pp2, 1, which.max)][todo$j],
            #        PP.H0.abf = pmin(ph0.1[todo$i], ph0.2[todo$j]),
            #        PP.H1.abf = NA, PP.H2.abf = NA, PP.H3.abf = NA,
            #        PP.H4.abf = NA), todo[, .(idx1 = i, idx2 = j)])))
        elif any(drop):
            todo_df = todo_df[~drop].reset_index(drop=True)  # reset index for the positional lookups below

    bf1 = bf1[isnps]
    bf2 = bf2[isnps]

    results = []
    PP = []
    for k in range(len(todo_df)):
        df = pd.DataFrame({'snp': isnps, 'bf1': bf1.values[todo_df['i'][k]].astype(np.float64),
                           'bf2': bf2.values[todo_df['j'][k]].astype(np.float64)})
        df['internal_sum_lABF'] = df['bf1'] + df['bf2']
        df['snp_pp_h4'] = np.exp(df['internal_sum_lABF'] - logsum(df['internal_sum_lABF']))
        pp_abf = combine_abf(df['bf1'], df['bf2'], p1, p2, p12, verbose=verbose)

        # guard against undefined posteriors before storing them
        if df['snp_pp_h4'].isnull().all():
            df['snp_pp_h4'] = 0
            pp_abf = pd.Series([1, 0, 0, 0, 0], index=pp_abf.index, dtype=np.float64)
        PP.append(df['snp_pp_h4'])
        hit1 = bf1.columns[np.argmax(bf1.values[todo_df['i'][k]])]
        # if (is.null(hit1)) {
        #     hit1 = "-"
        #     pp.abf[c(1, 3)] = c(0, 1)
        # }
        hit2 = bf2.columns[np.argmax(bf2.values[todo_df['j'][k]])]
        # if (is.null(hit2)) {
        #     hit2 = "-"
        #     pp.abf[c(1, 2)] = c(0, 1)
        # }
        results.append([df.shape[0], hit1, hit2] + pp_abf.tolist())
    results = pd.DataFrame(results, columns=['nsnps', 'hit1', 'hit2'] + pp_abf.index.tolist())
    results = pd.concat([results, todo_df[['i','j']].rename(columns={'i':'idx1', 'j':'idx2'})], axis=1)
    PP = pd.DataFrame(PP).T
    if len(todo_df) > 1:
        PP.columns = [f"snp_pp_h4_row{i}" for i in range(len(todo_df))]
    else:
        PP.columns = ["snp_pp_h4_abf"]

    m = results[['hit1', 'hit2']].duplicated()
    if any(m):
        results = results[~m]
        PP = PP[PP.columns[~m]]

    PP = pd.concat([pd.Series(isnps, name='snp'), PP], axis=1)
    return {'summary': results, 'results': PP, 'priors': pd.Series({'p1':p1, 'p2':p2, 'p12':p12})}


def logbf_to_pp(bf, pi, last_is_null=True):
    """
    Convert log Bayes factors to posterior probabilities

    Args:
        bf: log BFs (L x p matrix, or L x (p+1) if the last column corresponds to the null)
        pi: prior probability
        last_is_null: True if the last column of the BF matrix corresponds to the null hypothesis of no association
    """
    if isinstance(bf, pd.DataFrame):
        cols = bf.columns
        index = bf.index
        bf = bf.values.copy()
    else:
        cols = None
        bf = bf.copy()

    n = bf.shape[1]
    if last_is_null:
        n -= 1
    if np.ndim(pi) == 0:
        if pi > 1/n:
            pi = 1/n
        if last_is_null:
            pi = np.r_[np.full(n, pi), 1-n*pi]
        else:
            pi = np.full(n, pi)
    m = pi == 0
    if any(m):
        pi[m] = 1e-16
    pi /= np.sum(pi)
    if last_is_null:
        bf -= bf[:, [-1]]
    priors = np.tile(np.log(pi), [bf.shape[0], 1])

    x = bf + priors
    mmax = np.max(x, 1, keepdims=True)
    denom = mmax + np.log(np.sum(np.exp(x - mmax), 1, keepdims=True))
    pp = np.exp(bf + priors - denom)
    if cols is not None:
        pp = pd.DataFrame(pp, columns=cols, index=index)
    return pp

--------------------------------------------------------------------------------
/qtl/core.py:
--------------------------------------------------------------------------------
import subprocess
import os


def check_dependency(name):
    """Check that an external dependency is available on $PATH"""
    e = subprocess.call(f"which {name}", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    if e != 0:
        raise RuntimeError(f"External dependency '{name}' not installed")


def refresh_gcs_token():
    """Refresh the GCS OAuth token used when streaming remote files"""
    t = subprocess.check_output('gcloud auth application-default print-access-token',
                                shell=True).decode().strip()
    # os.environ (unlike os.putenv) also updates the current process environment
    os.environ['GCS_OAUTH_TOKEN'] = t
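
# Example: verify that external tools used elsewhere in this package (e.g., by
# qtl.genotype) are available before running:
#   check_dependency('tabix')
#   check_dependency('bcftools')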

--------------------------------------------------------------------------------
/qtl/genotype.py:
--------------------------------------------------------------------------------
# Author: Francois Aguet
import numpy as np
import pandas as pd
import gzip
import subprocess
import os
import tempfile

MISSING = -9  # PLINK2 convention
gt_dosage_dict = {'0/0': 0, '0/1': 1, '1/1': 2, './.': MISSING,
                  '0|0': 0, '0|1': 1, '1|0': 1, '1|1': 2, '.|.': MISSING}

class GenotypeIndexer(object):
    def __init__(self, genotype_df, variant_df, sample_ids=None):
        self.genotype_df = genotype_df
        self.index_dict = {j: i for i, j in enumerate(variant_df.index)}
        self.variant_df = variant_df.copy()
        self.variant_df['index'] = np.arange(variant_df.shape[0])
        self.chr_variant_dfs = {c: g[['pos', 'index']] for c, g in self.variant_df.groupby('chrom')}
        if sample_ids is None:
            self.sample_ids = genotype_df.columns
            self.sample_ix = np.arange(genotype_df.shape[1])
        else:
            self.sample_ids = sample_ids
            self.sample_ix = np.array([genotype_df.columns.tolist().index(i) for i in sample_ids])

    def set_sample_ids(self, sample_ids):
        self.sample_ids = sample_ids
        self.sample_ix = np.array([self.genotype_df.columns.tolist().index(i) for i in sample_ids])

    def get_indexes(self, variant_ids):
        return [self.index_dict[i] for i in variant_ids]

    def get_genotype(self, variant_id):
        return self.genotype_df.values[self.index_dict[variant_id], self.sample_ix]

    def get_genotypes(self, variant_ids):
        return self.genotype_df.values[[self.index_dict[i] for i in variant_ids]][:, self.sample_ix]

    def get_genotype_window(self, region_str):
        chrom, pos = region_str.split(':')
        start, end = pos.split('-')
        lb = np.searchsorted(self.chr_variant_dfs[chrom]['pos'].values, int(start))
        ub = np.searchsorted(self.chr_variant_dfs[chrom]['pos'].values, int(end), side='right')
        ub = np.minimum(ub, self.chr_variant_dfs[chrom].shape[0]-1)
        lb = self.chr_variant_dfs[chrom]['index'].values[lb]
        ub = self.chr_variant_dfs[chrom]['index'].values[ub]
        return self.genotype_df.iloc[lb:ub][self.sample_ids]
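
# Example (sketch): index a dosage matrix for fast lookups. genotype_df is
# variants x samples; variant_df has 'chrom' and 'pos' columns, indexed by
# variant ID, in the same order as genotype_df (IDs below are placeholders):
#   gi = GenotypeIndexer(genotype_df, variant_df)
#   g = gi.get_genotype('chr1_123456_A_G_b38')             # single variant
#   gdf = gi.get_genotype_window('chr1:1000000-2000000')   # all variants in a window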

def get_sample_ids(vcf):
    """Get sample IDs"""
    if vcf.endswith('.bcf'):
        return subprocess.check_output(f'bcftools query -l {vcf}', shell=True).decode().strip().split('\n')
    else:
        with gzip.open(vcf, 'rt') as f:
            for line in f:
                if line[:2] == '##': continue
                break
        return line.strip().split('\t')[9:]


def get_contigs(vcfpath):
    """Get list of contigs"""
    chrs = subprocess.check_output('tabix --list-chroms '+vcfpath, shell=True, executable='/bin/bash')
    return chrs.decode().strip().split()


def get_variant_ids(vcf):
    """Get list of variant IDs ('ID' field)"""
    s = subprocess.check_output(f'zcat {vcf} | grep -v "#" | cut -f3', shell=True)
    return s.decode().strip().split('\n')


def get_cis_genotypes(chrom, tss, vcf, field='GT', dosages=True, window=1000000):
    """Get genotypes in cis window (using tabix)"""
    region_str = chrom+':'+str(np.maximum(tss-window, 1))+'-'+str(tss+window)
    return get_genotypes_region(vcf, region_str, field=field, dosages=dosages)


def get_genotypes_region(vcf, region, field='GT', dosages=True):
    """Get genotypes, using region (chr:start-end) string"""
    s = subprocess.check_output(f'tabix {vcf} {region}',
                                shell=True, executable='/bin/bash')
    s = s.decode().strip()
    if len(s) == 0:
        return None
        # raise ValueError(f'No variants in region {region}')
    s = s.split('\n')
    variant_ids = [si.split('\t', 3)[-2] for si in s]
    field_ix = s[0].split('\t')[8].split(':').index(field)

    if dosages:
        if field == 'GT':
            s = [[gt_dosage_dict[i.split(':', field_ix+1)[field_ix]] for i in si.split('\t')[9:]] for si in s]
        elif field == 'DS':
            s = [[i.split(':', field_ix+1)[field_ix] for i in si.split('\t')[9:]] for si in s]
        dtype = np.float32  # covers both GT-derived dosages (incl. MISSING) and DS values
    else:
        s = [[i.split(':', field_ix+1)[field_ix] for i in si.split('\t')[9:]] for si in s]
        dtype = str

    return pd.DataFrame(data=s, index=variant_ids, columns=get_sample_ids(vcf), dtype=dtype)


def impute_mean(df, missing=lambda x: np.isnan(x), verbose=True):
    """Row-wise mean imputation (in place). Missing values: np.nan by default."""
    if isinstance(df, pd.DataFrame):
        genotypes = df.values
    else:
        genotypes = df

    n = 0
    for g in genotypes:
        ix = missing(g)
        if np.any(ix):
            g[ix] = np.mean(g[~ix])
            n += 1

    if verbose and n > 0:
        print(f'  * imputed at least 1 sample in {n} sites')


def get_genotype(variant_id, vcf, field='GT', convert_gt=True, sample_ids=None):
    """
    Parse genotypes for given variant from VCF. Requires tabix.

    variant_id: {chr}_{pos}_{ref}_{alt}_{build}
    vcf: VCF path
    field: GT or DS
    convert_gt: convert GT to dosages
    sample_ids: VCF sample IDs
    """
    chrom, pos = variant_id.split('_')[:2]
    s = subprocess.check_output(f"tabix {vcf} {chrom}:{pos}-{pos}", shell=True)
    if len(s) == 0:
        raise ValueError(f"Variant '{variant_id}' not found in VCF.")

    s = s.decode().strip()
    if '\n' in s:  # multiple variants at this position; match on ID
        s = s.split('\n')
        try:
            s = s[np.nonzero(np.array([i.split('\t', 3)[-2] for i in s]) == variant_id)[0][0]]
        except IndexError:
            raise ValueError("Variant ID not found in VCF.")
    s = s.split('\t')
    fmt = s[8].split(':')

    if field == 'DS':
        if 'DS' in fmt:
            ds_ix = fmt.index('DS')
            s = np.array([np.float32(i.split(':')[ds_ix]) for i in s[9:]])  # dosages
        else:
            raise ValueError('No dosage (DS) values found in VCF.')
    else:  # use GT if DS not requested/present
        assert fmt[0] == 'GT'
        s = [i.split(':', 1)[0] for i in s[9:]]
        if convert_gt:
            s = np.float32([gt_dosage_dict[i] for i in s])

    if sample_ids is None:
        sample_ids = get_sample_ids(vcf)
    s = pd.Series(s, index=sample_ids, name=variant_id)

    return s


def get_genotypes(variant_ids, vcf, field='GT', drop_duplicates=True):
    """Parse genotypes for multiple variants from VCF. Requires tabix."""
    variant_id_set = set(variant_ids)

    with tempfile.NamedTemporaryFile() as regions_file:
        df = pd.DataFrame([i.split('_')[:2] for i in variant_id_set], columns=['chr', 'pos'])
        df['pos'] = df['pos'].astype(int)
        df = df.sort_values(['chr', 'pos'])
        df.to_csv(regions_file.name, sep='\t', index=False, header=False)
        s = subprocess.check_output(f'tabix {vcf} --regions {regions_file.name}', shell=True)

    s = s.decode().strip().split('\n')
    s = [i.split('\t') for i in s]
    variant_ids2 = [i[2] for i in s]
    if field == 'GT':
        gt_ix = s[0][8].split(':').index('GT')
        dosages = [[gt_dosage_dict[j.split(':')[gt_ix]] for j in i[9:]] for i in s]
    elif field == 'DS':
        ds_ix = s[0][8].split(':').index('DS')
        dosages = np.float32([[j.split(':')[ds_ix] for j in i[9:]] for i in s])
    df = pd.DataFrame(dosages, index=variant_ids2, columns=get_sample_ids(vcf))
    df = df[df.index.isin(variant_id_set)]
    if drop_duplicates:
        df = df[~df.index.duplicated()]
    return df
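
# Example (sketch): load dosages for a window from a tabix-indexed VCF (path is
# a placeholder) and mean-impute missing genotypes (encoded as MISSING for GT):
#   df = get_genotypes_region('genotypes.vcf.gz', 'chr1:1000000-2000000')
#   impute_mean(df, missing=lambda x: x == MISSING)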

def get_allele_stats(genotype_df):
    """Returns allele frequency, minor allele samples, and minor allele counts (row-wise).

    Assumes missing genotypes have been imputed (see impute_mean).
    """
    # allele frequency
    n2 = 2 * genotype_df.shape[1]
    af = genotype_df.sum(1) / n2
    # minor allele samples and counts
    ix = af <= 0.5
    m = genotype_df > 0.5
    a = m.sum(1)  # samples carrying the ALT allele
    b = (genotype_df < 1.5).sum(1)  # samples carrying the REF allele
    ma_samples = np.where(ix, a, b)
    a = (genotype_df * m).sum(1).astype(int)  # ALT allele count
    ma_count = np.where(ix, a, n2-a)
    return af, ma_samples, ma_count
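
# Worked example: for one variant with dosages [0, 1, 2, 1] across 4 samples
# (2N = 8 alleles): af = 4/8 = 0.5, ma_samples = 3 (minor allele carriers),
# ma_count = 4.
#   af, ma_samples, ma_count = get_allele_stats(genotype_df)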

def load_vcf(vcf, field='GT', dtype=None, verbose=False):
    """Load VCF as DataFrame"""
    sample_ids = subprocess.check_output(f'bcftools query -l {vcf}', shell=True).decode().strip().split()
    n_samples = len(sample_ids)
    n_variants = int(subprocess.check_output(f'bcftools index -n {vcf}', shell=True).decode())

    if dtype is None:
        if field == 'GT':
            dtype = np.int8
        elif field == 'DS':
            dtype = np.float32
    dosages = np.zeros([n_variants, n_samples], dtype=dtype)

    variant_ids = []
    with gzip.open(vcf, 'rt') as f:
        for line in f:
            if line.startswith('#'): continue  # skip header lines
            break

        # parse format from first line
        line = line.strip().split('\t')
        if field not in line[8]:
            raise ValueError(f"FORMAT does not include {field}. Available fields: {', '.join(line[8].split(':'))}")
        format_ix = line[8].split(':').index(field)
        variant_ids.append(line[2])
        if field == 'GT':
            dosages[0,:] = [gt_dosage_dict.get(i.split(':')[format_ix], MISSING) for i in line[9:]]
        elif field == 'DS':
            d = [i.split(':')[format_ix] for i in line[9:]]
            d = [dtype(i) if i != '.' else dtype(MISSING) for i in d]
            dosages[0,:] = d

        for k,line in enumerate(f, 1):
            line = line.strip().split('\t')
            variant_ids.append(line[2])
            if field == 'GT':
                dosages[k,:] = [gt_dosage_dict.get(i.split(':')[format_ix], MISSING) for i in line[9:]]
            elif field == 'DS':
                d = [i.split(':')[format_ix] for i in line[9:]]
                d = [dtype(i) if i != '.' else dtype(MISSING) for i in d]
                dosages[k,:] = d
            if verbose and ((k+1) % 1000 == 0 or k+1 == n_variants):
                print(f'\rVariants parsed: {k+1:,}', end='' if k+1 < n_variants else None)

    return pd.DataFrame(dosages, index=variant_ids, columns=sample_ids)

--------------------------------------------------------------------------------
/qtl/gtex.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import json
import urllib.request
import ssl
from collections.abc import Iterable
from matplotlib.colors import hsv_to_rgb, to_hex


def s2d(x):
    """Parse donor ID from sample ID"""
    if isinstance(x, str):
        return '-'.join(x.split('-')[:2])
    elif isinstance(x, Iterable):
        return ['-'.join(i.split('-')[:2]) for i in x]


def get_tissue_id(t):
    """Convert tissue name to tissue ID"""
    if isinstance(t, str):
        return t.replace('(','').replace(')','').replace(' - ', ' ').replace(' ', '_')
    elif isinstance(t, Iterable):
        return [i.replace('(','').replace(')','').replace(' - ', ' ').replace(' ', '_') for i in t]


def _get_api_data():
    # note: certificate verification is disabled for the portal API request
    context = ssl.create_default_context()
    context.check_hostname = False
    context.verify_mode = ssl.CERT_NONE
    tissues_json = json.loads(urllib.request.urlopen('https://gtexportal.org/api/v2/dataset/tissueSiteDetail',
                                                     context=context).read().decode())['data']
    return tissues_json


def get_colors_df(diff_brain=False):
    """Return pd.DataFrame mapping tissue IDs to colors

    diff_brain: if True, assign distinct hues to the brain subregions (which otherwise share one color)
    """
    tissues_json = _get_api_data()
    colors_df = pd.DataFrame(tissues_json).rename(columns={
        'tissueSiteDetailId':'tissue_id',
        'colorHex':'color_hex',
        'colorRgb':'color_rgb',
        'tissueSiteDetail':'tissue_site_detail',
        'tissueSiteDetailAbbr':'tissue_abbrv',
        'tissueSite':'tissue_site',
        'ontologyId':'ontology_id',
    }).set_index('tissue_id')
    colors_df = colors_df[['tissue_site', 'tissue_site_detail', 'tissue_abbrv', 'ontology_id', 'color_rgb', 'color_hex']]
    colors_df['color_hex'] = '#' + colors_df['color_hex']
    if diff_brain:
        rgb_s = pd.Series({
            'Brain_Amygdala':                        hsv_to_rgb([0.1,  1., 0.933]),
            'Brain_Anterior_cingulate_cortex_BA24':  hsv_to_rgb([0.11, 1., 0.933]),
            'Brain_Caudate_basal_ganglia':           hsv_to_rgb([0.12, 1., 0.933]),
            'Brain_Cerebellar_Hemisphere':           hsv_to_rgb([0.13, 1., 0.933]),
            'Brain_Cerebellum':                      hsv_to_rgb([0.13, 1., 0.933]),
            'Brain_Cortex':                          hsv_to_rgb([0.14, 1., 0.933]),
            'Brain_Frontal_Cortex_BA9':              hsv_to_rgb([0.14, 1., 0.933]),
            'Brain_Hippocampus':                     hsv_to_rgb([0.15, 1., 0.933]),
            'Brain_Hypothalamus':                    hsv_to_rgb([0.16, 1., 0.933]),
            'Brain_Nucleus_accumbens_basal_ganglia': hsv_to_rgb([0.17, 1., 0.933]),
            'Brain_Putamen_basal_ganglia':           hsv_to_rgb([0.18, 1., 0.933]),
            'Brain_Spinal_cord_cervical_c-1':        hsv_to_rgb([0.19, 1., 0.933]),
            'Brain_Substantia_nigra':                hsv_to_rgb([0.2,  1., 0.933]),
        })
        brain_tissues = [i for i in sorted(colors_df.index) if i.startswith('Brain')]
        colors_df.loc[brain_tissues, 'color_hex'] = rgb_s[brain_tissues].apply(lambda x: to_hex(x).upper())
        colors_df.loc[brain_tissues, 'color_rgb'] = rgb_s[brain_tissues].apply(
            lambda x: ','.join(np.round(x*255).astype(int).astype(str)))

    colors_df.index.name = 'tissue_id'
    colors_df.insert(3, 'tissue_title', colors_df['tissue_site_detail'].map(tissue_title_map))
    return colors_df
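
# Example (sketch): fetch the GTEx tissue color palette and look up one tissue:
#   colors_df = get_colors_df()
#   colors_df.loc['Whole_Blood', 'color_hex']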

# Simplified tissue names for figures
tissue_title_map = {
    'Adipose - Subcutaneous': 'Subcutaneous adipose',
    'Adipose - Visceral (Omentum)': 'Visceral omentum',
    'Adrenal Gland': 'Adrenal gland',
    'Artery - Aorta': 'Aorta',
    'Artery - Coronary': 'Coronary artery',
    'Artery - Tibial': 'Tibial artery',
    'Bladder': 'Bladder',
    'Brain - Amygdala': 'Amygdala',
    'Brain - Anterior cingulate cortex (BA24)': 'Anterior cingulate cortex',
    'Brain - Caudate (basal ganglia)': 'Caudate (basal ganglia)',
    'Brain - Cerebellar Hemisphere': 'Cerebellar hemisphere',
    'Brain - Cerebellum': 'Cerebellum',
    'Brain - Cortex': 'Cortex',
    'Brain - Frontal Cortex (BA9)': 'Frontal cortex (BA9)',
    'Brain - Hippocampus': 'Hippocampus',
    'Brain - Hypothalamus': 'Hypothalamus',
    'Brain - Nucleus accumbens (basal ganglia)': 'Nucleus accumbens (basal ganglia)',
    'Brain - Putamen (basal ganglia)': 'Putamen (basal ganglia)',
    'Brain - Spinal cord (cervical c-1)': 'Spinal cord (cervical c-1)',
    'Brain - Substantia nigra': 'Substantia nigra',
    'Breast - Mammary Tissue': 'Breast mammary tissue',
    'Cells - EBV-transformed lymphocytes': 'EBV-transformed lymphocytes',
    'Cells - Cultured fibroblasts': 'Cultured fibroblasts',
    'Cervix - Ectocervix': 'Ectocervix',
    'Cervix - Endocervix': 'Endocervix',
    'Colon - Sigmoid': 'Sigmoid colon',
    'Colon - Transverse': 'Transverse colon',
    'Esophagus - Gastroesophageal Junction': 'Gastroesophageal junction',
    'Esophagus - Mucosa': 'Esophagus mucosa',
    'Esophagus - Muscularis': 'Esophagus muscularis',
    'Fallopian Tube': 'Fallopian tube',
    'Heart - Atrial Appendage': 'Atrial appendage',
    'Heart - Left Ventricle': 'Left ventricle',
    'Kidney - Cortex': 'Kidney cortex',
    'Kidney - Medulla': 'Kidney medulla',
    'Liver': 'Liver',
    'Lung': 'Lung',
    'Minor Salivary Gland': 'Minor salivary gland',
    'Muscle - Skeletal': 'Skeletal muscle',
    'Nerve - Tibial': 'Tibial nerve',
    'Ovary': 'Ovary',
    'Pancreas': 'Pancreas',
    'Pituitary': 'Pituitary',
    'Prostate': 'Prostate',
    'Skin - Not Sun Exposed (Suprapubic)': 'Not sun-exposed skin (suprapubic)',
    'Skin - Sun Exposed (Lower leg)': 'Sun-exposed skin (lower leg)',
    'Small Intestine - Terminal Ileum': 'Small intestine terminal ileum',
    'Spleen': 'Spleen',
    'Stomach': 'Stomach',
    'Testis': 'Testis',
    'Thyroid': 'Thyroid',
    'Uterus': 'Uterus',
    'Vagina': 'Vagina',
    'Whole Blood': 'Whole blood',
}

entex_tissue_map = {
    "Peyer's patch": 'Small Intestine - Terminal Ileum',
    'adrenal gland': 'Adrenal Gland',
    'ascending aorta': 'Artery - Aorta',  # correct mapping?
    'body of pancreas': 'Pancreas',
    'breast epithelium': 'Breast - Mammary Tissue',
    'coronary artery': 'Artery - Coronary',
    'esophagus muscularis mucosa': 'Esophagus - Muscularis',
    'esophagus squamous epithelium': 'Esophagus - Mucosa',
    'gastrocnemius medialis': 'Muscle - Skeletal',
    'gastroesophageal sphincter': 'Esophagus - Gastroesophageal Junction',
    'heart left ventricle': 'Heart - Left Ventricle',
    'lower leg skin': 'Skin - Sun Exposed (Lower leg)',
    'omental fat pad': 'Adipose - Visceral (Omentum)',
    'ovary': 'Ovary',
    'prostate gland': 'Prostate',
    'right atrium auricular region': 'Heart - Atrial Appendage',
    'right lobe of liver': 'Liver',
    'sigmoid colon': 'Colon - Sigmoid',
    'spleen': 'Spleen',
    'stomach': 'Stomach',
    'subcutaneous adipose tissue': 'Adipose - Subcutaneous',
    'suprapubic skin': 'Skin - Not Sun Exposed (Suprapubic)',
    'testis': 'Testis',
    'thoracic aorta': 'Artery - Aorta',  # correct mapping?
    'thyroid gland': 'Thyroid',
    'tibial artery': 'Artery - Tibial',
    'tibial nerve': 'Nerve - Tibial',
    'transverse colon': 'Colon - Transverse',
    'upper lobe of left lung': 'Lung',
    'uterus': 'Uterus',
    'vagina': 'Vagina'
}

--------------------------------------------------------------------------------
/qtl/io.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from collections import defaultdict
import subprocess
import gzip


def to_bgzip(df, path, header=True, float_format=None):
    """Write DataFrame to bgzip"""
    assert path.endswith('.gz')
    bgzip = subprocess.Popen(f"bgzip -c > {path}", stdin=subprocess.PIPE, shell=True, encoding='utf8')
    df.to_csv(bgzip.stdin, sep='\t', index=False, header=header, float_format=float_format)
    stdout, stderr = bgzip.communicate()
    subprocess.check_call(f"tabix -f {path}", shell=True)


def sort_bed(bed_df, inplace=True):
    """Sort BED DataFrame (numerically by chromosome; 'X' is sorted as 23)"""
    sorted_df = bed_df.sort_values(['chr', 'start', 'end'], key=lambda x:
        x.str.replace('chr','').str.replace('X','23').astype(int) if x.dtype == object else x,
        inplace=inplace)
    if inplace:
        bed_df.reset_index(drop=True, inplace=True)
    else:
        sorted_df.reset_index(drop=True, inplace=True)
    return sorted_df


def write_bed(bed_df, output_name, header=True, float_format=None):
    """Write DataFrame to BED format"""
    if header:
        assert (bed_df.columns[0] == 'chr' or bed_df.columns[0] == '#chr') and bed_df.columns[1] == 'start' and bed_df.columns[2] == 'end'
        # header must be commented in BED format
        header = bed_df.columns.values.copy()
        header[0] = '#chr'
    to_bgzip(bed_df, output_name, header=header, float_format=float_format)
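
# Example (sketch): write a phenotype BED file (bgzip/tabix must be on $PATH;
# the first three columns must be 'chr', 'start', 'end'; path is a placeholder):
#   write_bed(bed_df, 'phenotypes.bed.gz')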

def read_gct(gct_file, sample_ids=None, dtype=None, load_description=True, skiprows=2):
    """Load GCT as DataFrame"""
    if sample_ids is not None:
        sample_ids = ['Name'] + list(sample_ids)

    if gct_file.endswith('.gct.gz') or gct_file.endswith('.gct'):
        if dtype is not None:
            # parse the header line to assign a dtype to each sample column
            opener = gzip.open(gct_file, 'rt') if gct_file.endswith('.gz') else open(gct_file)
            with opener as gct:
                for _ in range(skiprows):
                    gct.readline()
                header_ids = gct.readline().strip().split()
            dtypes = {i:dtype for i in header_ids[2:]}
            dtypes['Name'] = str
            dtypes['Description'] = str
            df = pd.read_csv(gct_file, sep='\t', skiprows=skiprows, usecols=sample_ids, index_col=0, dtype=dtypes)
        else:
            df = pd.read_csv(gct_file, sep='\t', skiprows=skiprows, usecols=sample_ids, index_col=0)
    elif gct_file.endswith('.parquet'):
        df = pd.read_parquet(gct_file, columns=sample_ids)
    else:
        raise ValueError('Unsupported input format.')
    if not load_description and 'Description' in df.columns:
        df.drop('Description', axis=1, inplace=True)
    return df


def write_gct(df, gct_file, float_format='%.6g', compresslevel=6):
    """Write DataFrame to GCT format"""
    assert df.index.name == 'Name' and df.columns[0] == 'Description'
    if gct_file.endswith('.gct.gz'):
        opener = gzip.open(gct_file, 'wt', compresslevel=compresslevel)
    else:
        opener = open(gct_file, 'w')

    with opener as gct:
        gct.write(f'#1.2\n{df.shape[0]:d}\t{df.shape[1]-1:d}\n')
        df.to_csv(gct, sep='\t', float_format=float_format)


def gtf_to_tss_bed(annotation_gtf, feature='gene', exclude_chrs=[], phenotype_id='gene_id'):
    """Parse genes and TSSs from GTF and return DataFrame for BED output"""
    chrom = []
    start = []
    end = []
    gene_id = []
    gene_name = []

    if annotation_gtf.endswith('.gz'):
        opener = gzip.open(annotation_gtf, 'rt')
    else:
        opener = open(annotation_gtf, 'r')

    with opener as gtf:
        for row in gtf:
            row = row.strip().split('\t')
            if row[0][0] == '#' or row[2] != feature: continue  # skip header lines and non-matching features
            chrom.append(row[0])

            # TSS: gene start (0-based coordinates for BED)
            if row[6] == '+':
                start.append(np.int64(row[3])-1)
                end.append(np.int64(row[3]))
            elif row[6] == '-':
                start.append(np.int64(row[4])-1)  # last base of gene
                end.append(np.int64(row[4]))
            else:
                raise ValueError('Strand not specified.')

            attributes = defaultdict()
            for a in row[8].replace('"', '').split(';')[:-1]:
                kv = a.strip().split(' ')
                if kv[0] != 'tag':
                    attributes[kv[0]] = kv[1]
                else:
                    attributes.setdefault('tags', []).append(kv[1])

            gene_id.append(attributes['gene_id'])
            gene_name.append(attributes['gene_name'])

    if phenotype_id == 'gene_id':
        bed_df = pd.DataFrame(data={'chr':chrom, 'start':start, 'end':end, 'gene_id':gene_id}, columns=['chr', 'start', 'end', 'gene_id'], index=gene_id)
    elif phenotype_id == 'gene_name':
        bed_df = pd.DataFrame(data={'chr':chrom, 'start':start, 'end':end, 'gene_id':gene_name}, columns=['chr', 'start', 'end', 'gene_id'], index=gene_name)
    # drop rows corresponding to excluded chromosomes
    mask = np.ones(len(chrom), dtype=bool)
    for k in exclude_chrs:
        mask = mask & (bed_df['chr'] != k)
    bed_df = bed_df[mask]

    # sort by start position
    bed_df = bed_df.groupby('chr', sort=False, group_keys=False).apply(lambda x: x.sort_values('start'))

    return bed_df

--------------------------------------------------------------------------------
/qtl/locusplot.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""locusplot.py: LocusZoom-style visualization of the p-value landscape for multiple QTLs or GWAS"""

__author__ = "Francois Aguet"
__copyright__ = "Copyright 2019, The Broad Institute"
__license__ = "BSD3"

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.patches as patches
from cycler import cycler
import seaborn as sns
import argparse
import subprocess
import os
import io
import gzip
import re
from collections.abc import Iterable

from . import annotation
from . import genotype as gt
from . import plot


def get_sample_ids(vcf):
    """Get sample IDs from VCF"""
    if vcf.endswith('.bcf'):
        return subprocess.check_output(f'bcftools query -l {vcf}', shell=True).decode().strip().split('\n')
    else:
        with gzip.open(vcf, 'rt') as f:
            for line in f:
                if line[:2] == '##': continue
                break
        return line.strip().split('\t')[9:]


def get_cis_genotypes(chrom, tss, vcf, field='GT', window=1000000):
    """Get dosages from VCF (using tabix)"""
    region_str = chrom+':'+str(np.maximum(tss-window, 1))+'-'+str(tss+window)
    return get_genotypes_region(vcf, region_str, field=field)


def get_genotypes_region(vcf, region, field='GT'):
    """Get dosages from VCF (using tabix)"""
    print(f'Getting {field} for region {region}')
    cmd = 'tabix '+vcf+' '+region
    s = subprocess.check_output(cmd, shell=True, executable='/bin/bash')
    s = s.decode().strip()
    if len(s) == 0:
        raise ValueError(f'No variants in region {region}')
    s = s.split('\n')
    variant_ids = [si.split('\t', 3)[-2] for si in s]
    field_ix = s[0].split('\t')[8].split(':').index(field)

    if field == 'GT':
        gt_map = {'0/0':0, '0/1':1, '1/1':2, './.':np.nan,
                  '0|0':0, '0|1':1, '1|0':1, '1|1':2, '.|.':np.nan}
        s = [[gt_map[i.split(':', field_ix+1)[field_ix]] for i in si.split('\t')[9:]] for si in s]
    else:
        s = [[i.split(':', field_ix+1)[field_ix] for i in si.split('\t')[9:]] for si in s]

    return pd.DataFrame(data=s, index=variant_ids, columns=get_sample_ids(vcf), dtype=np.float32)


def load_eqtl(eqtl_file, gene_id, chrom=None):
    """Load full eQTL or ieQTL summary statistics for the specified gene"""
    if eqtl_file.endswith('parquet'):
        p = eqtl_file
        if chrom is not None:
            p = eqtl_file.replace(re.findall(r'chr\d+', eqtl_file)[0], chrom)
        cols = ['phenotype_id', 'variant_id', 'pval_gi', 'pval_nominal']
        eqtl_df = pd.read_parquet(p, columns=cols)
        eqtl_df = eqtl_df[eqtl_df['phenotype_id'] == gene_id].set_index('variant_id').rename(columns={'pval_gi':'pval_nominal'})
    else:
        s = subprocess.check_output(f'zcat {eqtl_file} | grep {gene_id}', shell=True).decode()
        eqtl_cols = ['gene_id', 'variant_id', 'tss_distance', 'ma_samples', 'ma_count', 'maf', 'pval_nominal', 'slope', 'slope_se']
        eqtl_df = pd.read_csv(io.StringIO(s), sep='\t', header=None, names=eqtl_cols, index_col=1)
    eqtl_df['position'] = eqtl_df.index.map(lambda x: int(x.split('_')[1]))
    return eqtl_df


def load_gwas(gwas_file, variant_ids):
    """Load GWAS summary statistics"""
    gwas_df = pd.read_csv(gwas_file, sep='\t', usecols=['panel_variant_id', 'position', 'pvalue', 'frequency', 'sample_size'], index_col=0)
    gwas_df = gwas_df.loc[gwas_df.index.isin(variant_ids)].rename(columns={'pvalue':'pval_nominal', 'frequency':'maf'})
    gwas_df['maf'] = np.where(gwas_df['maf']<=0.5, gwas_df['maf'], 1-gwas_df['maf'])
    return gwas_df


def compute_ld(genotype_df, variant_id):
    """Compute LD (r2) between variant_id and all rows of genotype_df"""
    # equivalent to genotype_df.corrwith(genotype_df.loc[variant_id], axis=1, method='pearson')**2
    g0 = genotype_df - genotype_df.values.mean(1, keepdims=True)
    d = (g0**2).sum(1) * (g0.loc[variant_id]**2).sum()
    return (g0 * g0.loc[variant_id]).sum(1)**2 / d
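
# Example (sketch): r² of all variants in a window relative to a lead variant,
# given an imputed dosage matrix (variants x samples; the ID is a placeholder):
#   r2_s = compute_ld(genotype_df, 'chr1_123456_A_G_b38')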

def get_ld(vcf, variant_id, phenotype_bed, window=200000):
    """Load genotypes and compute LD (r2)"""
    phenotype_df = pd.read_csv(phenotype_bed, sep='\t', index_col=3, nrows=0).drop(['#chr', 'start', 'end'], axis=1)
    chrom, pos, _, _, _ = variant_id.split('_')
    pos = int(pos)
    genotype_df = get_cis_genotypes(chrom, pos, vcf, window=window)[phenotype_df.columns]
    gt.impute_mean(genotype_df, verbose=False)
    r2_s = compute_ld(genotype_df, variant_id)
    return r2_s


def get_rsid(id_lookup_table, variant_id):
    """Look up the rsID for a variant in a (gzipped) lookup table"""
    s = subprocess.check_output(f'zcat {id_lookup_table} | grep {variant_id}', shell=True).decode()
    rs_id = [i for i in s.strip().split('\t') if i.startswith('rs')]
    assert len(rs_id) == 1
    return rs_id[0]
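
# Example (sketch): LD relative to a lead variant, restricted to the samples in
# a phenotype BED file (paths/IDs are placeholders):
#   r2_s = get_ld('genotypes.vcf.gz', 'chr1_123456_A_G_b38', 'phenotypes.bed.gz')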

def compare_loci(pval_df1, pval_df2, r2_s, variant_id=None, rs_id=None,
                 highlight_ids=None, colorbar=True, ah=2, aw=2):
    """Plot similar to LocusCompare (Liu et al., Nat Genet, 2019)"""
    assert pval_df1.index.equals(pval_df2.index)

    dl = 0.75
    dr = 0.75
    db = 0.75
    dt = 0.25
    fw = dl + aw + dr
    fh = db + ah + dt

    fig = plt.figure(facecolor=(1,1,1), figsize=(fw,fh))
    ax = fig.add_axes([dl/fw, db/fh, aw/fw, ah/fh])

    # LocusZoom colors
    lz_colors = ["#7F7F7F", "#282973", "#8CCCF0", "#69BD45", "#F9A41A", "#ED1F24"]
    select_args = {'s':24, 'marker':'D', 'c':"#714A9D", 'edgecolor':'k', 'lw':0.25}
    highlight_args = {'s':24, 'marker':'D', 'edgecolor':'k', 'lw':0.25}
    cmap = mpl.colors.ListedColormap(lz_colors)
    bounds = np.append(-1, np.arange(0,1.2,0.2))
    norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

    if colorbar:
        s = 0.66
        cax = fig.add_axes([(dl+aw+0.2)/fw, (db+ah-1.25*s)/fh, s*0.25/fw, s*1.25/fh])
        cb = mpl.colorbar.ColorbarBase(cax, cmap=cmap,
                                       norm=norm,
                                       boundaries=bounds[1:],  # start at 0
                                       ticks=bounds[1:],
                                       spacing='proportional',
                                       orientation='vertical')
        cax.set_title(r'r$\mathregular{^2}$', fontsize=12)
        cax.set_ylim([0,1])

    if rs_id is not None:
        t = rs_id
    elif variant_id is not None:  # reformat variant ID
        t = variant_id.split('_b')[0].replace('_',':',1).replace('_','-')

    x = -np.log10(pval_df1['pval_nominal'])
    y = -np.log10(pval_df2['pval_nominal'])

    # sort variants by LD; plot high LD in front
    s = r2_s[x.index].sort_values().index
    ax.scatter(x[s], y[s], c=r2_s[s].replace(np.nan, -1), s=20, cmap=cmap, norm=norm, edgecolor='k', lw=0.25, clip_on=False)

    if highlight_ids is not None:
        ax.scatter(x[highlight_ids], y[highlight_ids], **highlight_args, clip_on=False)

    if variant_id is not None:
        x = -np.log10(pval_df1.loc[variant_id, 'pval_nominal'])
        y = -np.log10(pval_df2.loc[variant_id, 'pval_nominal'])
        ax.scatter(x, y, **select_args)
        txt = ax.annotate(t, (x, y), xytext=(-5,5), textcoords='offset points', ha='right')
    else:  # annotate lead variants
        v = pval_df1['pval_nominal'].idxmin()
        x = -np.log10(pval_df1.loc[v, 'pval_nominal'])
        y = -np.log10(pval_df2.loc[v, 'pval_nominal'])
        t = v.split('_b')[0].replace('_',':',1).replace('_','-')
        # ax.scatter(x, y, **select_args)
        txt = ax.annotate(t, (x, y), xytext=(5,5), textcoords='offset points', ha='left')
        v = pval_df2['pval_nominal'].idxmin()
        x = -np.log10(pval_df1.loc[v, 'pval_nominal'])
        y = -np.log10(pval_df2.loc[v, 'pval_nominal'])
        t = v.split('_b')[0].replace('_',':',1).replace('_','-')
        # ax.scatter(x, y, **select_args)
        txt = ax.annotate(t, (x, y), xytext=(-5,5), textcoords='offset points', ha='right')

    ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True, min_n_ticks=4, nbins=5))
    ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True, min_n_ticks=4, nbins=5))

    ax.set_xlabel(r'-log$\mathregular{_{10}}$(p-value)', fontsize=12)
    ax.set_ylabel(r'-log$\mathregular{_{10}}$(p-value)', fontsize=12)

    ax.set_xlim([0, ax.get_xlim()[1]])
    ax.set_ylim([0, ax.get_ylim()[1]])
    ax.spines['left'].set_position(('outward', 6))
    ax.spines['bottom'].set_position(('outward', 6))
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    return ax
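
# Example (sketch): LocusCompare-style plot of eQTL vs. GWAS -log10(p), given
# two tables indexed by variant ID with a 'pval_nominal' column (ID is a placeholder):
#   ax = compare_loci(eqtl_df, gwas_df, r2_s, variant_id='chr1_123456_A_G_b38')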

def plot_locus(pvals, variant_ids=None, gene=None, r2_s=None, rs_id=None,
               highlight_ids=None, credible_sets=None, show_lead=True, show_rsid=True, label_first_only=False,
               tracks=None, track_colors=None, shared_only=True, show_effect=False,
               xlim=None, ymax=None, miny=5, sharey=None, labels=None, label_fontsize=12, title=None, shade_range=None, shade_color='#cecece',
               label_pos='left', gene_label_pos=None, chr_label_pos='bottom', window=200000, colorbar=True, gene_scale=0.33,
               dl=0.75, aw=4, dr=0.75, db=0.5, ah=1.25, dt=0.25, ds=0.1, gh=0.2, th=1.5, ytext=5,
               single_ylabel=False, ylabel=r'-log$\mathregular{_{10}}$(p-value)', rasterized=False):
    """
    pvals: pd.DataFrame, or list of pd.DataFrame. Must contain 'chr', 'position', and 'pval_nominal' (or 'pip') columns.
    variant_ids: lead variant ID(s) to label; if None, the most significant variant of each input is used
    gene: qtl.annotation.Gene, or list thereof
    tracks: pd.DataFrame of signal tracks (e.g., ATAC-seq coverage), indexed by position, one column per track
    track_colors: dict mapping track name to color
    shared_only: only plot variants that are present in all inputs
    sharey: list of dataset indexes with shared ylim
    show_effect: indicate effect direction of lead variant with up/down arrow
    """

    if isinstance(pvals, pd.DataFrame):
        pvals = [pvals]
    n = len(pvals)
    if not isinstance(gene, Iterable):
        gene = [gene]

    if variant_ids is None:
        variant_ids = []
        for p in pvals:
            if 'pval_nominal' in p:
                if p['pval_nominal'].max() > 1:  # assume -log10(P)
                    variant_ids.append(p['pval_nominal'].idxmax())
                else:
                    variant_ids.append(p['pval_nominal'].idxmin())
            elif 'pip' in p:
                variant_ids.append(p['pip'].idxmax())
            else:
                variant_ids.append(None)
    elif isinstance(variant_ids, str):
        variant_ids = [variant_ids]*n

    i = [i for i,p in enumerate(pvals) if variant_ids[0] in p.index]
    if i:
        chrom, pos = pvals[i[0]].loc[variant_ids[0], ['chr', 'position']]
        pos = int(pos)
    else:
        raise ValueError(f"{variant_ids[0]} not found in any of the inputs.")

    # set up figure
    if chr_label_pos != 'bottom':
        db = 0.25
        dt = 0.5
    fw = dl + aw + dr
    fh = db + n*ah + (n-1)*ds + dt
    if gene[0] is not None:
        fh += gh
    else:
        gh = 0
    if tracks is not None:
        fh += th + ds
    fig = plt.figure(figsize=(fw,fh), facecolor='none')
    axes = [fig.add_axes([dl/fw, (fh-dt-ah)/fh, aw/fw, ah/fh])]
    plot.format_plot(axes[-1], y_offset=6)
    for i in range(1,n):
        axes.append(fig.add_axes([dl/fw, (fh-dt-ah-i*(ah+ds))/fh, aw/fw, ah/fh], sharex=axes[0], facecolor='none'))
        plot.format_plot(axes[-1], y_offset=6)
    if tracks is not None:
        tax = fig.add_axes([dl/fw, (fh-dt-n*(ah+ds)-th)/fh, aw/fw, th/fh], sharex=axes[0], facecolor='none')
    if gene[0] is not None:
        gax = fig.add_axes([dl/fw, (db)/fh, aw/fw, gh/fh], sharex=axes[0], facecolor='none', label='Gene')

    if xlim is None:
        xlim = np.array([pos-window, pos+window])
    axes[0].set_xlim(xlim)
    axes[0].xaxis.set_major_locator(ticker.MaxNLocator(min_n_ticks=3, nbins=4))

    # LocusZoom colors
    lz_colors = ["#7F7F7F", "#282973", "#8CCCF0", "#69BD45", "#F9A41A", "#ED1F24"]
    select_args = {'s':24, 'marker':'D', 'c':"#714A9D", 'edgecolor':'k', 'lw':0.25}
    highlight_args = {'s':24, 'marker':'D', 'edgecolor':'k', 'lw':0.25}
    cmap = mpl.colors.ListedColormap(lz_colors)
    bounds = np.append(-1, np.arange(0,1.2,0.2))
    norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

    if colorbar:
        s = 0.66
        cax = fig.add_axes([(dl+aw+0.1)/fw, (fh-dt-ah+(1-s)/2*ah)/fh, s*ah/5/fw, s*ah/fh])
        cb = mpl.colorbar.ColorbarBase(cax, cmap=cmap,
                                       norm=norm,
                                       boundaries=bounds[1:],  # start at 0
                                       ticks=bounds[1:],
                                       spacing='proportional',
                                       orientation='vertical')
        cax.set_ylim([0,1])
        cax.set_title(r'r$\mathregular{^2}$', fontsize=12)

    # common set of variants
    common_ix = pvals[0].index
    for pval_df in pvals[1:]:
        common_ix = common_ix[common_ix.isin(pval_df.index)]

    # plot p-values
    ylabels = []
    for k,(ax,variant_id,pval_df) in enumerate(zip(axes, variant_ids, pvals)):
        # select variants in window
        m = (pval_df['position'] >= xlim[0]) & (pval_df['position'] <= xlim[1])
        if shared_only:
            m &= pval_df.index.isin(common_ix)
        window_df = pval_df.loc[m]
        x = window_df['position']
        if 'pval_nominal' in pval_df:
            if pval_df['pval_nominal'].max() > 1:  # assume values are already -log10(P)
                p = window_df['pval_nominal']
                minp = pval_df.loc[variant_id, 'pval_nominal']
            else:
                p = -np.log10(window_df['pval_nominal'])
                minp = -np.log10(pval_df.loc[variant_id, 'pval_nominal'])
            ylabels.append(ylabel)

            # sort variants by LD; plot high LD in front
            if r2_s is not None:
                s = r2_s[window_df.index].sort_values().index
                r2 = r2_s[s].replace(np.nan, -1)
            elif 'r2' in pval_df:
                s = pval_df.loc[window_df.index, 'r2'].sort_values(na_position='first').index
                r2 = pval_df.loc[s, 'r2'].replace(np.nan, -1)
            else:
                s = window_df.index
                r2 = pd.Series(-1, index=s)
            ax.scatter(x[s], p[s], c=r2, s=20, cmap=cmap, norm=norm, edgecolor='k', lw=0.25, rasterized=rasterized)

        elif 'pip' in pval_df:
            p = window_df['pip']
            ylabels.append('PIP')
            minp = pval_df.loc[variant_id, 'pip']
            if 'cs_id' in pval_df:
                pip_df = pval_df[pval_df['cs_id'].notnull()].copy()
                cs_ix, cs_id = pd.factorize(pip_df['cs_id'])
                if len(cs_id) < 10:
                    cs_colors = sns.color_palette('Set1', desat=0.66).as_hex()
                else:
                    cs_colors = sns.color_palette('tab20b', desat=1).as_hex()
                cs_cmap = mpl.colors.ListedColormap(cs_colors)
                cs_norm = mpl.colors.BoundaryNorm(np.arange(1, cs_cmap.N+1), cs_cmap.N)
                ax.scatter(pip_df['position'], pip_df['pip'], c=pip_df['cs_id'].map(pd.Series(range(len(cs_id)), index=cs_id)),
                           s=22, ec='none', cmap=cs_cmap, norm=cs_norm, rasterized=rasterized, clip_on=False)
        else:
            raise NotImplementedError

        if credible_sets is not None:
            df = pval_df.loc[credible_sets[k]['variant_id']]
            ax.scatter(df['position'], -np.log10(df['pval_nominal']), c=credible_sets[k]['cs_id']/10, s=50)
            # credible_sets[k]['variant_id']

        if highlight_ids is not None:  # plot relative to lead variant
            if isinstance(highlight_ids, str):
                highlight_ids = [highlight_ids]
            highlight_df = pval_df.loc[highlight_ids].copy()
            highlight_df = highlight_df[~highlight_df.index.isin(variant_ids)]  # drop lead variant
            ix = highlight_df.index
            if 'pip' not in pval_df:
                ax.scatter(x[ix], p[ix], c=r2[ix], cmap=cmap, norm=norm, **highlight_args)
            else:
                if 'cs_id' in pval_df:  # only plot highlight IDs that are in CSs
                    ix = highlight_df.index[highlight_df.index.isin(pip_df.index)]
                    ax.scatter(x[ix], p[ix], c='goldenrod', **highlight_args)
                else:
                    ax.scatter(x[ix], p[ix], c='goldenrod', **highlight_args)

        # plot lead/selected variant, add text label, etc.
374 | if show_lead: 375 | if variant_id in pval_df.index: 376 | minpos = pval_df.loc[variant_id, 'position'] 377 | else: 378 | minpos = None 379 | 380 | if 'pip' not in pval_df: 381 | ax.scatter(minpos, minp, **select_args) 382 | elif minpos is not None: # highlight lead variant for each CS 383 | pip_df2 = pip_df.loc[pip_df.groupby('cs_id').apply(lambda x: x['pip'].idxmax())] 384 | ax.scatter(pip_df2['position'], pip_df2['pip'], c=pip_df2['cs_id'].map(pd.Series(range(len(cs_id)), index=cs_id)).values, 385 | cmap=cs_cmap, norm=cs_norm, s=24, marker='D', ec='k', lw=0.25) 386 | 387 | if k == 0 or not label_first_only: 388 | for i,r in pip_df2.iterrows(): 389 | i = i.split('_b')[0].replace('_',':',1).replace('_','-') 390 | if (r['position']-xlim[0])/(xlim[1]-xlim[0]) < 0.55: # right 391 | txt = ax.annotate(i, (r['position'], r['pip']), xytext=(5,5), textcoords='offset points') 392 | else: 393 | txt = ax.annotate(i, (r['position'], r['pip']), xytext=(-5,5), ha='right', textcoords='offset points') 394 | txt.set_bbox(dict(facecolor='w', alpha=0.5, edgecolor='none', boxstyle="round,pad=0.1")) 395 | 396 | if rs_id is not None: 397 | if isinstance(rs_id, str): 398 | t = rs_id 399 | else: 400 | t = rs_id[k] 401 | else: 402 | t = variant_id.split('_b')[0].replace('_',':',1).replace('_','-') 403 | 404 | if show_rsid and minpos is not None and 'pip' not in pval_df and (k == 0 or not label_first_only): # text label 405 | if (minpos-xlim[0])/(xlim[1]-xlim[0]) < 0.55: # right 406 | txt = ax.annotate(t, (minpos, minp), xytext=(5,ytext), textcoords='offset points') 407 | else: 408 | txt = ax.annotate(t, (minpos, minp), xytext=(-5,ytext), ha='right', textcoords='offset points') 409 | txt.set_bbox(dict(facecolor='w', alpha=0.5, edgecolor='none', boxstyle="round,pad=0.1")) 410 | 411 | if show_effect: 412 | arrow_width = 0.025 413 | arrow_height = 0.3 414 | arrow_dr = 0.125 415 | arrow_dt = 0.2 416 | beta_col = [i for i in pval_df.columns if i in ('slope', 'beta', 'effect_size')] 417 | # assert len(beta_col) == 1, f"No effect size found" 418 | if len(beta_col) == 1: 419 | beta_col = beta_col[0] 420 | beta = pval_df.loc[variant_id, beta_col] 421 | if beta > 0: 422 | ax.arrow(1-arrow_dr/aw, 1-(arrow_height+arrow_dt)/ah, 0, arrow_height/ah, 423 | head_length=0.1/ah, width=arrow_width/aw, 424 | ec='none', fc='tab:green', transform=ax.transAxes) 425 | ax.text(1-arrow_dr*1.66/aw, 1-(arrow_height/2+arrow_dt)/ah, r"$\beta$", va='center', ha='center', transform=ax.transAxes) 426 | else: 427 | ax.arrow(1-arrow_dr/aw, 1-(arrow_dt-0.1)/ah, 0, -arrow_height/ah, 428 | head_length=0.1/ah, width=arrow_width/aw, 429 | ec='none', fc='tab:red', transform=ax.transAxes) 430 | ax.text(1-arrow_dr*1.66/aw, 1-(arrow_dt-0.1+arrow_height/2)/ah, r"$\beta$", va='center', ha='center', transform=ax.transAxes) 431 | 432 | ax.margins(y=0.2) 433 | if ymax is not None and isinstance(ymax, Iterable): 434 | if ymax[k] is not None: 435 | ax.set_ylim([0, ymax[k]]) 436 | else: 437 | ax.set_ylim([0, ax.get_ylim()[1]]) 438 | else: 439 | if 'pip' in pval_df: 440 | ax.set_ylim([0, ax.get_ylim()[1]]) 441 | elif ymax is None: 442 | ax.set_ylim([0, np.maximum(ax.get_ylim()[1], miny)]) 443 | else: 444 | ax.set_ylim([0, ymax]) 445 | 446 | if shade_range is not None: # highlight subregion with gray background 447 | ax.add_patch(patches.Rectangle((shade_range[0], 0), np.diff(shade_range)[0], ax.get_ylim()[1], facecolor=shade_color, zorder=-10)) 448 | 449 | if labels is not None: 450 | if label_pos == 'left': 451 | for ax,t in zip(axes, labels): 452 | 
ax.text(0.02, 0.925, t, transform=ax.transAxes, va='top', ha='left', fontsize=label_fontsize) 453 | elif label_pos == 'right': 454 | for ax,t in zip(axes, labels): 455 | ax.text(0.98, 0.925, t, transform=ax.transAxes, va='top', ha='right', fontsize=label_fontsize) 456 | 457 | if single_ylabel: 458 | # for ax in axes: 459 | # x.set_ylabel(None) 460 | m = db + (n*ah + (n-1)*ds)/2 461 | fig.text(0.035, m/fh, '-log$\mathregular{_{10}}$(p-value)', va='center', rotation=90, fontsize=14); 462 | else: 463 | for k,ax in enumerate(axes): 464 | ax.set_ylabel(ylabels[k], fontsize=12)#, labelpad=15) 465 | # if 'p-value' in ylabels[k]: 466 | # ax.yaxis.set_label_coords(-0.07*4/aw, 0.5) 467 | 468 | for ax in axes: 469 | ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True, min_n_ticks=3, nbins=4)) 470 | axes[0].set_title(title, fontsize=12) 471 | 472 | if chr_label_pos == 'bottom': 473 | v = axes 474 | else: 475 | v = axes[1:] 476 | if tracks is not None: 477 | v += [tax] 478 | for ax in v: 479 | plt.setp(ax.get_xticklabels(), visible=False) 480 | for line in ax.xaxis.get_ticklines(): 481 | line.set_markersize(0) 482 | line.set_markeredgewidth(0) 483 | 484 | if sharey is not None: # force equal y limits 485 | shared_max = 0 486 | for k in sharey: 487 | y = axes[k-1].get_ylim()[1] 488 | if y > shared_max: 489 | shared_max = y 490 | for k in sharey: 491 | axes[k-1].set_ylim([0, shared_max]) 492 | 493 | if tracks is not None and len(tracks) > 0: # plot, e.g., ATAC-seq tracks 494 | ntracks = tracks.shape[1] 495 | x = tracks.index 496 | maxv = tracks.max().max() 497 | for k, label in enumerate(tracks): 498 | y0 = (ntracks-1-k) * np.ones(len(x)) # vertical offset 499 | if track_colors is not None and label in track_colors: 500 | color = track_colors[label] 501 | else: 502 | color = 'k' 503 | c = tracks[label] 504 | tax.fill_between(x, 0.95*c/maxv + y0, y0, 505 | antialiased=False, linewidth=1, facecolor=color, 506 | clip_on=True, rasterized=True) 507 | tax.set_yticks(np.arange(ntracks)) 508 | tax.set_yticklabels(tracks.columns[::-1], fontsize=9, va='bottom') 509 | for line in tax.yaxis.get_ticklines(): 510 | line.set_markersize(0) 511 | line.set_markeredgewidth(0) 512 | for i in ['top', 'bottom', 'right', 'left']: 513 | tax.spines[i].set_visible(False) 514 | tax.set_ylim([0, ntracks]) 515 | 516 | if gene[0] is None or gene[0].chr != chrom: 517 | axes[-1].xaxis.tick_bottom() 518 | axes[-1].xaxis.set_label_position('bottom') 519 | axes[-1].spines['bottom'].set_visible(True) 520 | axes[-1].tick_params(axis='x', pad=2) 521 | axes[-1].xaxis.labelpad = 8 522 | axes[-1].set_xlabel(f'Position on {chrom} (Mb)', fontsize=12) 523 | xt = axes[-1].get_xticks() 524 | axes[-1].set_xticks(xt) 525 | axes[-1].set_xticklabels(xt/1e6) 526 | axes[-1].set_xlim(xlim) 527 | else: # add gene model 528 | # plot gene model and annotate 529 | if gene[0].end_pos < xlim[0]: 530 | x = gh/aw/2 531 | v = np.array([[x,0.2], [x-0.8*gh/aw, 0.5], [x,0.8]]) 532 | polygon = patches.Polygon(v, closed=True, color='k', transform=gax.transAxes, clip_on=False) 533 | gax.add_patch(polygon) 534 | txt = f'{gene[0].name} (~{(pos-gene[0].tss)/1e6:.1f}Mb)' 535 | gax.set_ylim([-1,1]) 536 | gax.text(1.5*x, 0.5, txt, va='center', ha='left', transform=gax.transAxes) 537 | elif gene[0].start_pos > xlim[1]: 538 | x = 1 - gh/aw/2 539 | v = np.array([[x,0.2], [x+0.8*gh/aw, 0.5], [x,0.8]]) 540 | polygon = patches.Polygon(v, closed=True, color='k', transform=gax.transAxes, clip_on=False) 541 | gax.add_patch(polygon) 542 | txt = f'{gene[0].name} 
(~{(gene[0].tss-pos)/1e6:.1f}Mb)' 543 | gax.set_ylim([-1,1]) 544 | gax.text(1 - gh/aw/2*1.5, 0.5, txt, va='center', ha='right', transform=gax.transAxes) 545 | else: 546 | m = np.mean(xlim) 547 | for k,g in enumerate(gene): 548 | if g is not None: 549 | g = g.collapse() 550 | y = len(gene)-1-k # revert position 551 | g.plot(ax=gax, yoffset=y, max_intron=1e9, pc_color='k', nc_color='k', ec='none', wx=0.1, scale=gene_scale, ylabels=None, clip_on=True) 552 | if gene_label_pos is None: 553 | if g.tss - m > m - g.tss: 554 | gax.annotate(g.name, (np.minimum(g.end_pos, xlim[1]), y), xytext=(5,0), textcoords='offset points', va='center', ha='left') 555 | else: 556 | gax.annotate(g.name, (np.maximum(g.start_pos, xlim[0]), y), xytext=(-5,0), textcoords='offset points', va='center', ha='right') 557 | elif gene_label_pos == 'left': 558 | gax.annotate(g.name, (np.maximum(g.start_pos, xlim[0]), y), xytext=(-5,0), textcoords='offset points', va='center', ha='right') 559 | elif gene_label_pos == 'right': 560 | gax.annotate(g.name, (np.minimum(g.end_pos, xlim[1]), y), xytext=(5,0), textcoords='offset points', va='center', ha='left') 561 | gax.set_ylim([-0.5, len(gene)-0.5]) 562 | 563 | if chr_label_pos == 'bottom': 564 | gax.set_xlabel(f'Position on {chrom} (Mb)', fontsize=12) 565 | else: 566 | plt.setp(gax.get_xticklabels(), visible=False) 567 | for line in gax.xaxis.get_ticklines(): 568 | line.set_markersize(0) # tick length 569 | line.set_markeredgewidth(0) # tick line width 570 | gax.spines['bottom'].set_visible(False) 571 | 572 | gax.set_yticks([]) 573 | gax.set_yticklabels([]) 574 | gax.spines['top'].set_visible(False) 575 | gax.spines['left'].set_visible(False) 576 | gax.spines['right'].set_visible(False) 577 | gax.set_title('') 578 | xt = gax.get_xticks() 579 | gax.set_xticks(xt) 580 | gax.set_xticklabels(xt/1e6) 581 | gax.set_xlim(xlim) 582 | axes.append(gax) 583 | 584 | if chr_label_pos != 'bottom': 585 | axes[0].xaxis.tick_top() 586 | axes[0].xaxis.set_label_position('top') 587 | axes[0].set_xlabel(f'Position on {chrom} (Mb)', fontsize=12) 588 | axes[0].spines['top'].set_visible(True) 589 | axes[0].tick_params(axis='x', pad=2) 590 | axes[0].xaxis.labelpad = 8 591 | 592 | for ax in axes: 593 | ax.set_facecolor('none') 594 | 595 | # for i in range(len(pvals)): 596 | # axes[i].get_yaxis().set_label_coords(-0.12,0.5) 597 | 598 | return axes 599 | 600 | 601 | def plot_ieqtl_locus(eqtl_df, ieqtl_df, gwas_df, r2_s, gene_id, variant_id, annot, 602 | independent_df=None, rs_id=None, trait_name=None, pp4=None, window=200000, 603 | aw=4, ah=1.25): 604 | 605 | pvals = [ 606 | gwas_df.rename(columns={'pvalue':'pval_nominal'}), 607 | eqtl_df.loc[eqtl_df.index.isin(r2_s.index)], 608 | ieqtl_df.loc[ieqtl_df.index.isin(r2_s.index)] 609 | ] 610 | 611 | if trait_name is None: 612 | trait_name = 'GWAS' 613 | 614 | labels = [trait_name] 615 | if pp4 is None: 616 | labels.extend(['eQTL', 'ieQTL']) 617 | else: 618 | labels.extend([f'eQTL (PP4 = {pp4[0]:.2f})', f'ieQTL (PP4 = {pp4[1]:.2f})']) 619 | 620 | plot_locus(pvals, variant_ids=variant_id, r2_s=r2_s, gene=annot.gene_dict[gene_id], rs_id=rs_id, 621 | highlight_ids=None, aw=aw, ah=ah, 622 | labels=labels, shade_range=None, gene_label_pos='right', chr_label_pos='bottom', window=window) 623 | 624 | 625 | 626 | if __name__ == '__main__': 627 | mpl.use('Agg') 628 | 629 | parser = argparse.ArgumentParser(description='locus plot') 630 | parser.add_argument('--eqtl', required=True, help='QTL summary statistics file containing all pairwise associations') 631 | 
parser.add_argument('--ieqtl', required=True, help='ieQTL summary statistics file containing all pairwise associations')
632 |     parser.add_argument('--gwas', required=True, help='GWAS summary statistics file')
633 |     parser.add_argument('--vcf', required=True, help='VCF file')
634 |     parser.add_argument('--phenotype_bed', required=True, help='Phenotype BED file used for QTL mapping (required for parsing sample IDs)')
635 |     parser.add_argument('--gene_id', required=True, help='Gene ID')
636 |     parser.add_argument('--gtf', required=True, help='Gene annotation in GTF format')
637 |     parser.add_argument('--variant_id', help='Variant ID')
638 |     parser.add_argument('--phenotype_id', default=None, help='Select p-values for a specific phenotype, e.g., for sQTLs')
639 |     parser.add_argument('--rs_id', help='rs ID of the lead variant')
640 |     parser.add_argument('--id_lookup_table', help='Lookup table mapping variant IDs to rs IDs (rs ID must be in last column)')
641 |     parser.add_argument('--window', default=200000, type=int, help='Plotting window around the lead variant, in bp')
642 |     parser.add_argument('--labels', nargs='+', default=None, help='Panel labels')
643 |     parser.add_argument('--ymax', nargs='+', type=np.float64, default=None)
644 |     parser.add_argument('--sharey', nargs='+', type=int, help='Use same y-axis for the specified plots (1-indexed, with top plot starting at 1.)', default=None)
645 |     parser.add_argument('--top_variant', default='ieQTL', choices=['GWAS', 'eQTL', 'ieQTL'])
646 |     parser.add_argument('--output_dir', default='.', type=str, help='Output directory')
647 |     args = parser.parse_args()
648 | 
649 |     print('Loading gene annotation')
650 |     annot = annotation.Annotation(args.gtf)
651 |     gene = annot.gene_dict[args.gene_id]
652 |     chrom = gene.chr
653 | 
654 |     if args.phenotype_id is not None:
655 |         load_id = args.phenotype_id
656 |     else:
657 |         load_id = args.gene_id
658 | 
659 |     print('Loading eQTL summary statistics')
660 |     eqtl_df = load_eqtl(args.eqtl, load_id, chrom)
661 | 
662 |     print('Loading ieQTL summary statistics')
663 |     ieqtl_df = load_eqtl(args.ieqtl, load_id, chrom)
664 |     if not np.all(ieqtl_df.index.isin(eqtl_df.index)):
665 |         print('WARNING: ieQTL results contain variants not present in eQTL results')
666 | 
667 |     print('Loading GWAS summary statistics')
668 |     gwas_df = load_gwas(args.gwas, eqtl_df.index)
669 | 
670 |     if args.variant_id is None:
671 |         common_ix = ieqtl_df.index[ieqtl_df.index.isin(eqtl_df.index) & ieqtl_df.index.isin(gwas_df.index)]
672 |         if args.top_variant == 'ieQTL':
673 |             variant_id = ieqtl_df.loc[common_ix, 'pval_nominal'].idxmin()
674 |         elif args.top_variant == 'eQTL':
675 |             variant_id = eqtl_df.loc[common_ix, 'pval_nominal'].idxmin()
676 |         else:
677 |             variant_id = gwas_df.loc[common_ix, 'pval_nominal'].idxmin()
678 |     else:
679 |         variant_id = args.variant_id
680 |     chrom, pos, ref, alt, _ = variant_id.split('_')
681 |     pos = int(pos)
682 | 
683 |     print('Loading genotypes and computing LD')
684 |     r2_s = get_ld(args.vcf, variant_id, args.phenotype_bed)
685 | 
686 |     rs_id = args.rs_id
687 |     if rs_id is None and args.id_lookup_table is not None:
688 |         print('Parsing rsID lookup table')
689 |         rs_id = get_rsid(args.id_lookup_table, variant_id)
690 | 
691 |     print('Generating plot')
692 |     plot_locus([gwas_df, eqtl_df, ieqtl_df], variant_ids=variant_id, gene=gene, r2_s=r2_s,
693 |                rs_id=rs_id, labels=[i.encode('utf-8').decode('unicode_escape') for i in args.labels] if args.labels is not None else None,
694 |                ymax=args.ymax, sharey=args.sharey,
695 |                window=args.window, shared_only=True)
696 | 
697 |     pdf_file = os.path.join(args.output_dir, f'{gene.name}.{variant_id}.locus_plot.pdf')
698 |     plt.savefig(pdf_file)
699 | 
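    # Example invocation (a sketch; file names are hypothetical and must match the
    # formats expected by load_eqtl/load_gwas/get_ld defined above):
    # python3 qtl/locusplot.py --eqtl eqtl.allpairs.txt.gz --ieqtl ieqtl.allpairs.txt.gz \
    #     --gwas gwas.txt.gz --vcf genotypes.vcf.gz --phenotype_bed phenotypes.bed.gz \
    #     --gene_id ENSG00000000001.1 --gtf gencode.annotation.gtf --output_dir .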
700 |     print('Done.')
701 | 
--------------------------------------------------------------------------------
/qtl/map.py:
--------------------------------------------------------------------------------
1 | """qtl.map: functions for mapping QTLs"""
2 | 
3 | __author__ = "Francois Aguet"
4 | __copyright__ = "Copyright 2018-2020, The Broad Institute"
5 | __license__ = "BSD3"
6 | 
7 | import numpy as np
8 | import pandas as pd
9 | import scipy.stats
10 | import re
11 | from . import stats
12 | from . import genotype as gt
13 | from . import locusplot
14 | try:
15 |     import rpy2.robjects  # 'import rpy2' alone does not expose rpy2.robjects
16 |     has_rpy2 = True
17 | except ImportError:
18 |     has_rpy2 = False
19 | 
20 | 
21 | def calculate_association(genotype, phenotype_s, covariates_df=None, impute=True, logp=False):
22 |     """Compute genotype-phenotype associations"""
23 |     if logp and not has_rpy2:
24 |         raise ValueError("The rpy2 package is required to compute log p-values.")
25 |     if isinstance(genotype, pd.Series):
26 |         genotype_df = genotype.to_frame().T
27 |     elif isinstance(genotype, pd.DataFrame):
28 |         genotype_df = genotype
29 |     else:
30 |         raise ValueError('Input type not supported')
31 | 
32 |     # assert np.all(genotype_df.columns==phenotype_s.index)
33 |     if covariates_df is not None:
34 |         assert covariates_df.index.equals(genotype_df.columns)
35 | 
36 |     # impute missing genotypes
37 |     if impute:
38 |         gt.impute_mean(genotype_df, verbose=False)
39 | 
40 |     # residualize genotypes and phenotype
41 |     if covariates_df is not None:
42 |         r = stats.Residualizer(covariates_df)
43 |         gt_res_df = r.transform(genotype_df)
44 |         p_res_s = r.transform(phenotype_s)
45 |         num_covar = covariates_df.shape[1]
46 |     else:
47 |         gt_res_df = genotype_df
48 |         p_res_s = phenotype_s
49 |         num_covar = 0
50 | 
51 |     if isinstance(p_res_s, pd.Series):
52 |         n = p_res_s.std() / gt_res_df.std(axis=1)
53 |     else:
54 |         n = p_res_s.std(axis=1) / gt_res_df.std(axis=1).values
55 | 
56 |     gt_res_df = stats.center_normalize(gt_res_df, axis=1)
57 |     if isinstance(p_res_s, pd.Series):
58 |         p_res_s = stats.center_normalize(p_res_s)
59 |     else:
60 |         p_res_s = stats.center_normalize(p_res_s, axis=1)
61 | 
62 |     if isinstance(p_res_s, pd.Series):
63 |         r = gt_res_df.dot(p_res_s)
64 |     else:  # single genotype x phenotypes
65 |         r = gt_res_df.dot(p_res_s.T).squeeze()
66 |     dof = gt_res_df.shape[1] - 2 - num_covar
67 |     tstat = r * np.sqrt(dof/(1-r*r))
68 | 
69 |     if not logp:
70 |         pval = 2*scipy.stats.t.cdf(-np.abs(tstat), dof)
71 |     else:
72 |         r_pt = rpy2.robjects.r['pt']
73 |         rt = rpy2.robjects.vectors.FloatVector(-np.abs(tstat))
74 |         pval = -(np.array(r_pt(rt, dof, lower_tail=True, log=True)) + np.log(2)) * np.log10(np.e)
75 | 
76 |     df = pd.DataFrame(pval, index=tstat.index, columns=['pval_nominal'])
77 |     df['slope'] = r * n
78 |     df['slope_se'] = df['slope'] / tstat
79 |     df['corr_r2'] = r*r
80 |     df['tstat'] = tstat
81 |     n2 = 2 * genotype_df.shape[1]
82 |     af = genotype_df.sum(1) / n2
83 |     if isinstance(p_res_s, pd.Series):
84 |         df['af'] = af
85 |     else:
86 |         assert len(af) == 1
87 |         df['af'] = af.values[0]
88 |     ix = df['af'] <= 0.5
89 |     m = genotype_df > 0.5
90 |     a = m.sum(1).astype(int)
91 |     b = (genotype_df < 1.5).sum(1).astype(int)
92 |     df['ma_samples'] = np.where(ix, a, b)
93 |     a = (genotype_df * m).sum(1).round().astype(int)  # round for missing/imputed genotypes
94 |     df['ma_count'] = np.where(ix, a, n2-a)
95 |     if isinstance(genotype, pd.DataFrame):
96 |         if logp:
97 |             df['r2'] = locusplot.compute_ld(genotype, df['pval_nominal'].idxmax())
98 |         else:
99 |             df['r2'] = locusplot.compute_ld(genotype, df['pval_nominal'].idxmin())
100 | 
101 |     # if isinstance(df.index[0], str) and '_' in df.index[0]:  # assume variant IDs in format chr_pos_ref_alt_build
102 |     if isinstance(df.index[0], str) and len(re.findall(r"^(?:chr)?\w_?\d+_", df.index[0])) == 1:
103 |         df['chr'] = df.index.map(lambda x: x.split('_')[0])
104 |         df['position'] = df.index.map(lambda x: int(x.split('_')[1]))
105 |     if isinstance(p_res_s, pd.Series):
106 |         df.index.name = 'variant_id'
107 |     else:
108 |         df.index.name = 'phenotype_id'
109 |     m = df['pval_nominal'] == 0
110 |     if any(m):
111 |         e = np.nextafter(0, 1)  # np.finfo(np.float64).tiny
112 |         print(f"Warning: underflow detected (setting to {e}), use logp=True to compute p-values as -log10(P).")
113 |         df.loc[m, 'pval_nominal'] = e
114 |     return df
115 | 
116 | 
117 | def map_pairs(genotype_df, phenotype_df, covariates_df=None, impute=True):
118 |     """Calculates association statistics for arbitrary phenotype-variant pairs"""
119 |     assert genotype_df.shape[0] == phenotype_df.shape[0]
120 |     assert genotype_df.columns.equals(phenotype_df.columns)
121 |     assert covariates_df is None or genotype_df.columns.equals(covariates_df.index)
122 |     if impute:
123 |         gt.impute_mean(genotype_df, verbose=False)
124 | 
125 |     # residualize genotypes and phenotype
126 |     if covariates_df is not None:
127 |         r = stats.Residualizer(covariates_df)
128 |         gt_res_df = r.transform(genotype_df)
129 |         p_res_df = r.transform(phenotype_df)
130 |         num_covar = covariates_df.shape[1]
131 |     else:
132 |         gt_res_df = genotype_df
133 |         p_res_df = phenotype_df
134 |         num_covar = 0
135 | 
136 |     n = p_res_df.std(axis=1).values / gt_res_df.std(axis=1).values
137 | 
138 |     gt_res_df = stats.center_normalize(gt_res_df, axis=1)
139 |     p_res_df = stats.center_normalize(p_res_df, axis=1)
140 | 
141 |     r = np.sum(gt_res_df.values * p_res_df.values, axis=1)
142 |     dof = gt_res_df.shape[1] - 2 - num_covar
143 | 
144 |     tstat2 = dof*r*r / (1-r*r)
145 |     pval = scipy.stats.f.sf(tstat2, 1, dof)
146 | 
147 |     df = pd.DataFrame({'phenotype_id':phenotype_df.index, 'variant_id':genotype_df.index, 'pval_nominal':pval})
148 |     df['slope'] = r * n
149 |     df['slope_se'] = df['slope'].abs() / np.sqrt(tstat2)
150 |     df['af'] = genotype_df.sum(1).values / (2*genotype_df.shape[1])
151 |     df['maf'] = np.where(df['af'] <= 0.5, df['af'], 1-df['af'])
152 |     return df
153 | 
154 | 
155 | def calculate_interaction(genotype_s, phenotype_s, interaction_s, covariates_df=None, impute=True):
156 | 
157 |     assert genotype_s.index.equals(interaction_s.index)
158 | 
159 |     # impute missing genotypes
160 |     if impute:
161 |         gt.impute_mean(genotype_s, verbose=False)
162 | 
163 |     # interaction term
164 |     gi = genotype_s * interaction_s
165 | 
166 |     # center
167 |     g0 = genotype_s - genotype_s.mean()
168 |     gi0 = gi - gi.mean()
169 |     i0 = interaction_s - interaction_s.mean()
170 |     p0 = phenotype_s - phenotype_s.mean()
171 | 
172 |     dof = phenotype_s.shape[0] - 4
173 |     # residualize
174 |     if covariates_df is not None:
175 |         r = stats.Residualizer(covariates_df)
176 |         g0 = r.transform(g0.values.reshape(1,-1), center=False)
177 |         gi0 = r.transform(gi0.values.reshape(1,-1), center=False)
178 |         p0 = r.transform(p0.values.reshape(1,-1), center=False)
179 |         i0 = r.transform(i0.values.reshape(1,-1), center=False)
180 |         dof -= covariates_df.shape[1]
181 |     else:
182 |         g0 = g0.values.reshape(1,-1)
183 |         gi0 = gi0.values.reshape(1,-1)
184 |         p0 = p0.values.reshape(1,-1)
185 |         i0 = i0.values.reshape(1,-1)
186 | 
187 |     # regression
188 |     X = np.r_[g0, i0, gi0].T
189 |     Xinv = np.linalg.inv(np.dot(X.T, X))
190 |     b = np.dot(np.dot(Xinv, X.T), p0.reshape(-1,1))
191 |     r = 
np.squeeze(np.dot(X, b)) - p0 192 | rss = np.sum(r*r) 193 | b_se = np.sqrt(np.diag(Xinv) * rss / dof) 194 | b = np.squeeze(b) 195 | tstat = b / b_se 196 | pval = 2*scipy.stats.t.cdf(-np.abs(tstat), dof) 197 | 198 | return pd.Series({ 199 | 'b_g':b[0], 'b_g_se':b_se[0], 'pval_g':pval[0], 200 | 'b_i':b[1], 'b_i_se':b_se[1], 'pval_i':pval[1], 201 | 'b_gi':b[2],'b_gi_se':b_se[2],'pval_gi':pval[2], 202 | })#, r[0] 203 | 204 | 205 | def compute_ld(genotype_df, variant_id): 206 | """Compute LD (r2)""" 207 | # return gt_df.corrwith(gt_df.loc[variant_id], axis=1, method='pearson')**2 208 | g0 = genotype_df - genotype_df.values.mean(1, keepdims=True) 209 | d = (g0**2).sum(1) * (g0.loc[variant_id]**2).sum() 210 | return (g0 * g0.loc[variant_id]).sum(1)**2 / d 211 | 212 | 213 | def get_conditional_pvalues(group_df, genotypes, phenotype_df, covariates_df, 214 | phenotype_id=None, window=200000, maf_threshold=0): 215 | """ 216 | Compute conditional p-values for a set of variants defined in group_df 217 | 218 | group_df : dataframe with columns 'variant_id' and 'phenotype_id' 219 | genotypes : pd.DataFrame or qtl.genotype.GenotypeIndexer 220 | phenotype_df : pd.DataFrame 221 | covariates_df : pd.DataFrame 222 | """ 223 | assert phenotype_df.columns.equals(covariates_df.index) 224 | variant_id = group_df['variant_id'].iloc[0] 225 | 226 | if isinstance(genotypes, gt.GenotypeIndexer): 227 | gt_df = genotypes.get_genotype_window(variant_id, window=window) 228 | elif isinstance(genotypes, pd.DataFrame): 229 | gt_df = genotypes 230 | else: 231 | raise ValueError('Unsupported input format') 232 | 233 | maf = gt_df.sum(1) / (2*gt_df.shape[1]) 234 | maf = np.where(maf<=0.5, maf, 1-maf) 235 | 236 | gt_df = gt_df[maf >= maf_threshold] 237 | 238 | res = [] 239 | if phenotype_id is not None: 240 | pval_df = calculate_association(gt_df, phenotype_df.loc[phenotype_id], covariates_df=covariates_df) 241 | pval_df['r2'] = compute_ld(gt_df, variant_id) 242 | res.append(pval_df) 243 | 244 | for k,(variant_id, phenotype_id) in enumerate(zip(group_df['variant_id'], group_df['phenotype_id']), 1): 245 | print(f'\rProcessing {k}/{group_df.shape[0]}', end='') 246 | covariates = pd.concat([covariates_df, gt_df.loc[np.setdiff1d(group_df['variant_id'], variant_id)].T], axis=1) 247 | pval_df = calculate_association(gt_df, phenotype_df.loc[phenotype_id], covariates_df=covariates) 248 | pval_df['r2'] = compute_ld(gt_df, variant_id) 249 | 250 | res.append(pval_df) 251 | return res 252 | -------------------------------------------------------------------------------- /qtl/norm.py: -------------------------------------------------------------------------------- 1 | # Author: Francois Aguet 2 | import numpy as np 3 | import pandas as pd 4 | import scipy.stats as stats 5 | import warnings 6 | 7 | 8 | #-------------------------------------- 9 | # eQTL expression normalization 10 | #-------------------------------------- 11 | def normalize_quantiles(df): 12 | """ 13 | Quantile normalization to the average empirical distribution 14 | Note: replicates behavior of R function normalize.quantiles 15 | from library("preprocessCore") 16 | 17 | Reference: 18 | [1] Bolstad et al., Bioinformatics 19(2), pp. 
185-193, 2003 19 | 20 | Adapted from https://github.com/andrewdyates/quantile_normalize 21 | """ 22 | M = df.values.copy() 23 | 24 | Q = M.argsort(axis=0) 25 | m,n = M.shape 26 | 27 | # compute quantile vector 28 | quantiles = np.zeros(m) 29 | for i in range(n): 30 | quantiles += M[Q[:,i],i] 31 | quantiles = quantiles / n 32 | 33 | for i in range(n): 34 | # Get equivalence classes; unique values == 0 35 | dupes = np.zeros(m, dtype=np.int64) 36 | for j in range(m-1): 37 | if M[Q[j,i],i] == M[Q[j+1,i],i]: 38 | dupes[j+1] = dupes[j]+1 39 | 40 | # Replace column with quantile ranks 41 | M[Q[:,i],i] = quantiles 42 | 43 | # Average together equivalence classes 44 | j = m-1 45 | while j >= 0: 46 | if dupes[j] == 0: 47 | j -= 1 48 | else: 49 | idxs = Q[j-dupes[j]:j+1,i] 50 | M[idxs,i] = np.median(M[idxs,i]) 51 | j -= 1 + dupes[j] 52 | assert j == -1 53 | 54 | return pd.DataFrame(M, index=df.index, columns=df.columns) 55 | 56 | 57 | def inverse_normal_transform(M): 58 | """Transform rows to a standard normal distribution""" 59 | if isinstance(M, pd.Series): 60 | r = stats.rankdata(M) 61 | return pd.Series(stats.norm.ppf(r/(M.shape[0]+1)), index=M.index, name=M.name) 62 | else: 63 | R = stats.rankdata(M, axis=1) # ties are averaged 64 | Q = stats.norm.ppf(R/(M.shape[1]+1)) 65 | if isinstance(M, pd.DataFrame): 66 | Q = pd.DataFrame(Q, index=M.index, columns=M.columns) 67 | return Q 68 | 69 | #-------------------------------------- 70 | # DESeq size factor normalization 71 | #-------------------------------------- 72 | def deseq2_size_factors(counts_df): 73 | """ 74 | Calculate DESeq size factors 75 | median of ratio to reference sample (geometric mean of all samples) 76 | 77 | References: 78 | [1] Anders & Huber, 2010 79 | [2] R functions: 80 | DESeq::estimateSizeFactorsForMatrix 81 | """ 82 | idx = np.all(counts_df>0, axis=1) 83 | tmp_df = np.log(counts_df.loc[idx.values]) 84 | s = np.exp(np.median(tmp_df.T - np.mean(tmp_df, axis=1), axis=1)) 85 | return s 86 | 87 | 88 | def deseq2_normalized_counts(counts_df): 89 | """ 90 | Equivalent to DESeq2:::counts.DESeqDataSet; counts(x, normalized=T) 91 | """ 92 | return counts_df / deseq2_size_factors(counts_df) 93 | 94 | 95 | def deseq2_cpm(counts_df): 96 | """Calculate CPM normalized by DESeq size factors""" 97 | cpm_df = counts_df/counts_df.sum(axis=0)*1e6 98 | s = deseq2_size_factors(cpm_df) 99 | return cpm_df / s 100 | 101 | #-------------------------------------- 102 | # edgeR TMM normalization 103 | #-------------------------------------- 104 | def edger_calcnormfactors(counts_df, ref=None, logratio_trim=0.3, 105 | sum_trim=0.05, acutoff=-1e10, verbose=False): 106 | """ 107 | Calculate TMM (Trimmed Mean of M values) normalization. 108 | Reproduces edgeR::calcNormFactors.default 109 | 110 | Scaling factors for the library sizes that minimize 111 | the log-fold changes between the samples for most genes. 
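    Example (a sketch; `counts_df` is a genes x samples DataFrame of raw counts):
        tmm = edger_calcnormfactors(counts_df)
        cpm_df = edger_cpm(counts_df, tmm=tmm)  # TMM-normalized CPM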
112 | 
113 |     Effective library size: TMM scaling factor * library size
114 | 
115 |     References:
116 |      [1] Robinson & Oshlack, 2010
117 |      [2] R functions:
118 |          edgeR::calcNormFactors.default
119 |          edgeR:::.calcFactorWeighted
120 |          edgeR:::.calcFactorQuantile
121 |     """
122 | 
123 |     # discard genes with all-zero counts
124 |     Y = counts_df.values.copy()
125 |     allzero = np.sum(Y>0, axis=1) == 0
126 |     if np.any(allzero):
127 |         Y = Y[~allzero,:]
128 | 
129 |     # select reference sample
130 |     if ref is None:  # reference sample index
131 |         f75 = np.percentile(Y/np.sum(Y, axis=0), 75, axis=0)
132 |         ref = np.argmin(np.abs(f75-np.mean(f75)))
133 |         if verbose:
134 |             print('Reference sample index: '+str(ref))
135 | 
136 |     N = np.sum(Y, axis=0)  # total reads in each library
137 | 
138 |     # with np.errstate(divide='ignore'):
139 |     with warnings.catch_warnings():
140 |         warnings.simplefilter('ignore')
141 |         # log fold change; Mg in [1]
142 |         logR = np.log2((Y/N).T / (Y[:,ref]/N[ref])).T
143 |         # average log relative expression; Ag in [1]
144 |         absE = 0.5*(np.log2(Y/N).T + np.log2(Y[:,ref]/N[ref])).T
145 |         v = (N-Y)/N/Y
146 |         v = (v.T + v[:,ref]).T  # w in [1]
147 | 
148 |     ns = Y.shape[1]
149 |     tmm = np.zeros(ns)
150 |     for i in range(ns):
151 |         fin = np.isfinite(logR[:,i]) & np.isfinite(absE[:,i]) & (absE[:,i] > acutoff)
152 |         n = np.sum(fin)
153 | 
154 |         loL = np.floor(n*logratio_trim)+1
155 |         hiL = n + 1 - loL
156 |         loS = np.floor(n*sum_trim)+1
157 |         hiS = n + 1 - loS
158 |         rankR = stats.rankdata(logR[fin,i])
159 |         rankE = stats.rankdata(absE[fin,i])
160 |         keep = (rankR >= loL) & (rankR <= hiL) & (rankE >= loS) & (rankE <= hiS)
161 |         # in [1], w erroneously defined as 1/v ?
162 |         tmm[i] = 2**(np.nansum(logR[fin,i][keep]/v[fin,i][keep]) / np.nansum(1/v[fin,i][keep]))
163 | 
164 |     tmm = tmm / np.exp(np.mean(np.log(tmm)))
165 |     return tmm
166 | 
167 | 
168 | def edger_cpm_default(counts_df, lib_size=None, log=False, prior_count=0.25):
169 |     """
170 |     edgeR normalized counts
171 | 
172 |     Reproduces edgeR::cpm.default
173 |     """
174 |     if lib_size is None:
175 |         lib_size = counts_df.sum(axis=0)
176 |     if log:
177 |         prior_count_scaled = lib_size/np.mean(lib_size) * prior_count
178 |         lib_size = lib_size + 2 * prior_count_scaled
179 |     lib_size = 1e-6 * lib_size
180 |     if log:
181 |         return np.log2((counts_df + prior_count_scaled)/lib_size)
182 |     else:
183 |         return counts_df / lib_size
184 | 
185 | 
186 | def edger_cpm(counts_df, tmm=None, normalized_lib_sizes=True):
187 |     """
188 |     Return edgeR normalized/rescaled CPM (counts per million)
189 | 
190 |     Reproduces edgeR::cpm.DGEList
191 |     """
192 |     lib_size = counts_df.sum(axis=0)
193 |     if normalized_lib_sizes:
194 |         if tmm is None:
195 |             tmm = edger_calcnormfactors(counts_df)
196 |         lib_size = lib_size * tmm
197 |     return counts_df / lib_size * 1e6
198 | 
199 | #--------------------------------------
200 | # limma-voom functions
201 | #--------------------------------------
202 | def voom_transform(counts_df):
203 |     """Apply counts transformation from limma-voom"""
204 |     lib_size = counts_df.sum(0)
205 |     norm_factors = edger_calcnormfactors(counts_df)
206 |     return np.log2((counts_df + 0.5) / (lib_size*norm_factors + 1) * 1e6)
207 | 
208 | #--------------------------------------
209 | # PoissonSeq size factor normalization
210 | #--------------------------------------
211 | def poissonseq_size_factors(counts_df, maxiter=10):
212 |     """
213 |     PoissonSeq normalization from Li et al., Biostatistics, 2012
214 |     """
215 |     gsum = counts_df.sum(1)
216 | 
217 |     # initialize
218 |     ix = counts_df.index
219 |     libsize = counts_df.sum(0)
220 |     d_est = libsize / libsize.sum()
221 | 
222 |     # v = [d_est]
223 |     i = 0
224 |     meandiff = 1
225 |     while i < maxiter and meandiff > 1e-10:
226 |         d = np.outer(gsum, d_est)
227 |         gof = ((counts_df - d).pow(2) / d).sum(1)
228 |         lb, ub = np.percentile(gof, [25,75])
229 |         ix = gof[(lb<=gof) & (gof<=ub)].index
230 |         d_est0 = d_est
231 |         d_est = counts_df.loc[ix].sum(0) / gsum.loc[ix].sum()
232 |         meandiff = (d_est - d_est0).pow(2).sum() / counts_df.shape[1]
233 |         i += 1
234 |         # print(meandiff)
235 |         # v.append(d_est)
236 |     return d_est
237 | 
--------------------------------------------------------------------------------
/qtl/pca.py:
--------------------------------------------------------------------------------
1 | """qtl.pca: helper functions for PCA of expression data"""
2 | 
3 | __author__ = "Francois Aguet"
4 | __copyright__ = "Copyright 2018-2020, The Broad Institute"
5 | __license__ = "BSD3"
6 | 
7 | import numpy as np
8 | import pandas as pd
9 | import sklearn.decomposition
10 | 
11 | from . import norm
12 | from . import stats
13 | 
14 | 
15 | def normalize_counts(gct_df, C=None, threshold=10, threshold_frac=0.1):
16 |     """
17 |     Normalize (size factors), threshold, residualize, center, unit norm
18 | 
19 |     gct_df: read counts or TPMs
20 |     C: covariates matrix
21 |     """
22 | 
23 |     gct_norm_df = gct_df.copy() / norm.deseq2_size_factors(gct_df)
24 |     for x in gct_norm_df.values:
25 |         m = x == 0
26 |         if not all(m):
27 |             x[m] = np.min(x[~m])/2
28 | 
29 |     # threshold low expressed genes: >=10 counts in >10% of samples (default)
30 |     mask = np.mean(gct_norm_df >= threshold, axis=1) > threshold_frac
31 |     gct_norm_df = np.log10(gct_norm_df[mask])
32 | 
33 |     if C is not None:
34 |         gct_norm_df = stats.residualize(gct_norm_df, C, center=False)
35 | 
36 |     gct_norm_std_df = stats.center_normalize(gct_norm_df)
37 |     return gct_norm_std_df
38 | 
39 | 
40 | def get_pcs(gct_df, normalize=True, C=None, n_components=5, return_loadings=False, random_state=None):
41 |     """
42 |     Scale input GCT, threshold, normalize and calculate PCs
43 |     """
44 |     if normalize:
45 |         gct_norm_std_df = normalize_counts(gct_df, C=C)
46 |     else:
47 |         gct_norm_std_df = gct_df
48 | 
49 |     pca = sklearn.decomposition.PCA(n_components=n_components, svd_solver='auto', random_state=random_state)
50 |     pca.fit(gct_norm_std_df.T)
51 |     P = pca.transform(gct_norm_std_df.T)
52 |     pc_df = pd.DataFrame(P, index=gct_norm_std_df.columns,
53 |                          columns=[f'PC{i}' for i in range(1, P.shape[1]+1)])
54 |     pve_s = pd.Series(pca.explained_variance_ratio_ * 100, index=pc_df.columns, name='pve')
55 |     if not return_loadings:
56 |         return pc_df, pve_s
57 |     else:
58 |         loadings_df = pd.DataFrame(pca.components_.T, index=gct_norm_std_df.index, columns=pc_df.columns)
59 |         return pc_df, pve_s, loadings_df
--------------------------------------------------------------------------------
/qtl/pileup.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import glob
4 | import os
5 | import subprocess
6 | import contextlib
7 | import tempfile
8 | from collections.abc import Iterable
9 | import multiprocessing as mp
10 | import matplotlib.pyplot as plt
11 | import matplotlib.patches as patches
12 | from matplotlib.colors import hsv_to_rgb, rgb2hex
13 | import seaborn as sns
14 | from cycler import cycler
15 | import pyBigWig
16 | 
17 | from . import stats, annotation
18 | from . import plot as qtl_plot
19 | from . import genotype as gt
20 | from . 
import core 21 | 22 | 23 | @contextlib.contextmanager 24 | def cd(cd_path): 25 | if cd_path is not None: 26 | saved_path = os.getcwd() 27 | os.chdir(cd_path) 28 | yield 29 | os.chdir(saved_path) 30 | else: 31 | yield 32 | 33 | 34 | def _samtools_depth_wrapper(args): 35 | """ 36 | Wrapper for `samtools depth`. 37 | 38 | For files on GCP, GCS_OAUTH_TOKEN must be set. 39 | This can be done with qtl.refresh_gcs_token(). 40 | """ 41 | bam_file, region_str, sample_id, bam_index_dir, flags, user_project = args 42 | 43 | cmd = f"samtools depth {flags} -r {region_str} {bam_file}" 44 | if user_project is not None: 45 | cmd += f"?userProject={user_project}" 46 | with cd(bam_index_dir): 47 | c = subprocess.check_output(cmd, shell=True).decode().strip().split('\n') 48 | 49 | df = pd.DataFrame([i.split('\t') for i in c], columns=['chr', 'pos', sample_id]) 50 | df.index = df['pos'].astype(np.int32) 51 | return df[sample_id].astype(np.int32) 52 | 53 | 54 | def samtools_depth(region_str, bam_s, bam_index_dir=None, flags='-aa -Q 255 -d 100000', 55 | num_threads=12, user_project=None, verbose=True): 56 | """ 57 | Run samtools depth for a list of BAMs. 58 | 59 | Note: reads with the flags [UNMAP,SECONDARY,QCFAIL,DUP] are excluded by default; 60 | see documentation for `samtools depth` and http://www.htslib.org/doc/samtools-flags.html 61 | 62 | Parameters 63 | ---------- 64 | region_str : str 65 | Genomic region as 'chr:start-end' 66 | bam_s : pd.Series or dict 67 | sample_id -> bam_path 68 | bam_index_dir: str 69 | Directory already containing local copies of the BAM/CRAM indexes, or target directory 70 | flags : str 71 | Flags passed to samtools depth 72 | num_threads : int 73 | Number of threads 74 | user_project : str 75 | User project for GCP 76 | 77 | Returns 78 | ------- 79 | pileups_df : pd.DataFrame 80 | DataFrame of pileups (samples in columns) 81 | """ 82 | pileups_df = [] 83 | with mp.Pool(processes=num_threads) as pool: 84 | for k,r in enumerate(pool.imap(_samtools_depth_wrapper, [(i,region_str,j,bam_index_dir,flags,user_project) for j,i in bam_s.items()]), 1): 85 | if verbose: 86 | print(f'\r * running samtools depth on region {region_str} for bam {k}/{len(bam_s)}', end='' if k < len(bam_s) else None) 87 | pileups_df.append(r) 88 | pileups_df = pd.concat(pileups_df, axis=1) 89 | pileups_df.index.name = 'position' 90 | return pileups_df 91 | 92 | 93 | def read_regtools_junctions(junctions_file, convert_positions=True): 94 | """ 95 | Read output from regtools junctions extract and 96 | convert start/end positions to intron starts/ends. 97 | """ 98 | junctions_df = pd.read_csv(junctions_file, sep='\t', header=None, 99 | usecols=[0, 1, 2, 4, 5, 10], 100 | names=['chrom', 'start', 'end', 'count', 'strand', 'block_sizes']) 101 | if convert_positions: 102 | junctions_df['start'] += junctions_df['block_sizes'].apply(lambda x: int(x.split(',')[0])) + 1 103 | junctions_df['end'] -= junctions_df['block_sizes'].apply(lambda x: int(x.split(',')[1])) 104 | junctions_df.index = (junctions_df['chrom'] + ':' + junctions_df['start'].astype(str) 105 | + '-' + junctions_df['end'].astype(str) + ':' + junctions_df['strand']) 106 | return junctions_df 107 | 108 | 109 | def regtools_wrapper(args): 110 | """ 111 | Wrapper for regtools junctions extract. 
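    Expects `args` as a tuple of
    (bam_file, region_str, sample_id, bam_index_dir, strand, user_project),
    matching the unpacking in the function body.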
112 | Filters out secondary and supplementary alignments 113 | """ 114 | bam_file, region_str, sample_id, bam_index_dir, strand, user_project = args 115 | with tempfile.TemporaryDirectory() as tempdir: 116 | filtered_bam = os.path.join(tempdir, 'filtered.bam') 117 | cmd = f"samtools view -b -F 2304 {bam_file}" 118 | if user_project is not None: 119 | cmd += f"?userProject={user_project}" 120 | cmd += f" {region_str} > {filtered_bam}" 121 | with cd(bam_index_dir): 122 | subprocess.check_call(cmd, shell=True) 123 | subprocess.check_call(f"samtools index {filtered_bam}", shell=True) 124 | junctions_file = os.path.join(tempdir, 'junctions.txt.gz') 125 | cmd = f"regtools junctions extract \ 126 | -a 8 -m 50 -M 500000 -s {strand} \ 127 | {filtered_bam} | gzip -c > {junctions_file}" 128 | subprocess.check_call(cmd, shell=True, stderr=subprocess.DEVNULL) 129 | junctions_df = read_regtools_junctions(junctions_file, convert_positions=True) 130 | junctions_df.index.name = sample_id 131 | return junctions_df 132 | 133 | 134 | def regtools_extract_junctions(region_str, bam_s, bam_index_dir=None, strand=0, num_threads=12, 135 | user_project=None, verbose=True): 136 | """ 137 | region_str: string in 'chr:start-end' format 138 | bam_s: pd.Series or dict mapping sample_id->bam_path 139 | bam_index_dir: directory containing local copies of the BAM/CRAM indexes 140 | """ 141 | core.check_dependency('regtools') 142 | 143 | junctions_df = [] 144 | n = len(bam_s) 145 | with mp.Pool(processes=num_threads) as pool: 146 | for k,df in enumerate(pool.imap(regtools_wrapper, [(i,region_str,j,bam_index_dir,strand,user_project) for j,i in bam_s.items()]), 1): 147 | if verbose: 148 | print(f'\r * running regtools junctions extract on region {region_str} for bam {k}/{n}', end='' if k < n else None) 149 | junctions_df.append(df['count'].rename(df.index.name)) 150 | junctions_df = pd.concat(junctions_df, axis=1).infer_objects().fillna(0).astype(np.int32) 151 | junctions_df.index.name = 'junction_id' 152 | return junctions_df 153 | 154 | 155 | def norm_pileups(pileups_df, libsize_s, covariates_df=None, id_map=lambda x: '-'.join(x.split('-')[:2])): 156 | """ 157 | pileups_df: output from samtools_depth() 158 | libsize_s: pd.Series mapping sample_id->library size (total mapped reads) 159 | """ 160 | # convert pileups to reads per million 161 | pileups_rpm_df = pileups_df / libsize_s[pileups_df.columns] * 1e6 162 | pileups_rpm_df.rename(columns=id_map, inplace=True) 163 | 164 | if covariates_df is not None: 165 | residualizer = stats.Residualizer(covariates_df) 166 | pileups_rpm_df = residualizer.transform(pileups_rpm_df) 167 | 168 | return pileups_rpm_df 169 | 170 | 171 | def group_pileups(pileups_df, libsize_s, variant_id, genotypes, covariates_df=None, 172 | id_map=lambda x: '-'.join(x.split('-')[:2])): 173 | """ 174 | pileups_df: output from samtools_depth() 175 | libsize_s: pd.Series mapping sample_id->library size (total mapped reads) 176 | """ 177 | pileups_rpm_df = norm_pileups(pileups_df, libsize_s, covariates_df=covariates_df, id_map=id_map) 178 | 179 | # get genotype dosages 180 | if isinstance(genotypes, str) and genotypes.endswith('.vcf.gz'): 181 | g = gt.get_genotype(variant_id, genotypes)[pileups_rpm_df.columns] 182 | elif isinstance(genotypes, pd.Series): 183 | g = genotypes 184 | else: 185 | raise ValueError('Unsupported format for genotypes.') 186 | 187 | # average pileups by genotype or category 188 | cols = np.unique(g[g.notnull()]).astype(int) 189 | df = pd.concat([pileups_rpm_df[g[g == 
i].index].mean(axis=1).rename(i) for i in cols], axis=1) 190 | return df 191 | 192 | 193 | def plot(pileup_dfs, gene, mappability_bigwig=None, variant_id=None, order='additive', junctions_df=None, 194 | title=None, plot_variants=None, annot_track=None, max_intron=300, alpha=1, lw=0.5, junction_alpha=0.5, junction_lw=2, 195 | highlight_introns=None, highlight_introns2=None, shade_range=None, colors=None, junction_colors=None, 196 | ymax=None, xlim=None, rasterized=False, outline=False, labels=None, 197 | pc_color='k', nc_color='darkgray', show_cds=True, 198 | dl=0.75, aw=4.5, dr=0.75, db=0.5, ah=1.5, dt=0.25, ds=0.2): 199 | """ 200 | pileup_dfs: 201 | """ 202 | if junction_colors is None and colors is not None: 203 | junction_colors = colors 204 | 205 | if isinstance(pileup_dfs, pd.DataFrame): 206 | pileup_dfs = [pileup_dfs] 207 | num_pileups = len(pileup_dfs) 208 | 209 | nt = len(gene.transcripts) 210 | da = 0.08 * nt + 0.01*(nt-1) 211 | da2 = 0.12 212 | 213 | fw = dl + aw + dr 214 | fh = db + da + ds + (num_pileups-1)*da2 + num_pileups*ah + dt 215 | if mappability_bigwig is not None: 216 | fh += da2 217 | 218 | if variant_id is not None: 219 | chrom, pos, ref, alt = variant_id.split('_')[:4] 220 | pos = int(pos) 221 | if np.issubdtype(pileup_dfs[0].columns.dtype, np.integer): # assume that inputs are genotypes 222 | gtlabels = np.array([ 223 | f'{ref}:{ref}', 224 | f'{ref}:{alt}', 225 | f'{alt}:{alt}', 226 | ]) 227 | else: 228 | gtlabels = None 229 | else: 230 | pos = None 231 | gtlabels = None 232 | 233 | if pileup_dfs[0].shape[1] <= 3: 234 | cycler_colors = [ 235 | # hsv_to_rgb([0.55, 0.75, 0.8]), #(0.2, 0.65, 0.8), # blue 236 | # hsv_to_rgb([0.08, 1, 1]), #(1.0, 0.5, 0.0), # orange 237 | # hsv_to_rgb([0.3, 0.7, 0.7]), #(0.2, 0.6, 0.17), # green 238 | '#0374B3', # blue 239 | '#C84646', # red 240 | '#C69B3A', # gold 241 | ] 242 | else: 243 | cycler_colors = [rgb2hex(i) for i in plt.cm.tab10(np.arange(10))] 244 | custom_cycler = cycler('color', cycler_colors) 245 | 246 | fig = plt.figure(facecolor='none', figsize=(fw,fh)) 247 | ax = fig.add_axes([dl/fw, (db+da+ds)/fh, aw/fw, ah/fh], facecolor='none') 248 | ax.set_prop_cycle(custom_cycler) 249 | axv = [ax] 250 | for i in range(1, num_pileups): 251 | ax = fig.add_axes([dl/fw, (db+da+ds+i*(da2+ah))/fh, aw/fw, ah/fh], facecolor='none', sharex=axv[0]) 252 | ax.set_prop_cycle(custom_cycler) 253 | axv.append(ax) 254 | 255 | s = pileup_dfs[0].sum() 256 | if isinstance(order, list): 257 | sorder = order 258 | elif order == 'additive': 259 | sorder = s.index 260 | if s[sorder[0]] < s[sorder[-1]]: 261 | sorder = sorder[::-1] 262 | elif order == 'sorted': 263 | sorder = np.argsort(s)[::-1] 264 | elif order == 'none': 265 | sorder = s.index 266 | 267 | gene.set_plot_coords(max_intron=max_intron) 268 | for k,ax in enumerate(axv): 269 | xi = gene.map_pos(pileup_dfs[k].index) 270 | for j,i in enumerate(sorder): 271 | if i in pileup_dfs[k]: 272 | if outline: 273 | if colors is not None: 274 | c = colors[i] 275 | else: 276 | c = cycler_colors[j] 277 | ax.plot(xi, pileup_dfs[k][i], color=c, label=i, lw=lw, alpha=alpha, rasterized=rasterized) 278 | else: 279 | ax.fill_between(xi, pileup_dfs[k][i], label=i, alpha=alpha, rasterized=rasterized) 280 | 281 | if labels is None: 282 | labels = ['Mean RPM'] * num_pileups 283 | # format 284 | for k,ax in enumerate(axv): 285 | ax.margins(0) 286 | ax.set_ylabel(labels[k], fontsize=12) 287 | qtl_plot.format_plot(ax, fontsize=10) 288 | ax.tick_params(axis='x', length=3, width=0.6, pad=1) 289 | 
ax.set_xticks(gene.map_pos(gene.get_collapsed_coords().reshape(1,-1)[0])) 290 | ax.set_xticklabels([]) 291 | ax.spines['left'].set_position(('outward', 6)) 292 | 293 | if xlim is not None: 294 | ax.set_xlim(xlim) 295 | if ymax is not None: 296 | ax.set_ylim([0, ymax]) 297 | if gtlabels is not None: 298 | gtlabels = gtlabels[sorder] 299 | handles, _ = axv[-1].get_legend_handles_labels() 300 | # leg = axv[-1].legend(handles[::-1], gtlabels[::-1], loc='upper left', handlelength=0.75, handletextpad=0.5, bbox_to_anchor=(1.02,1), 301 | # labelspacing=0.2, borderaxespad=0, fontsize=10) 302 | leg = axv[-1].legend(handles[::-1], gtlabels[::-1], loc='lower left', handlelength=0.75, handletextpad=0.5, 303 | labelspacing=0.2, borderaxespad=None, fontsize=10)#, framealpha=1)#, facecolor=(1, 0, 1, 0)) 304 | 305 | for line in leg.get_lines(): 306 | line.set_linewidth(1.5) 307 | 308 | if plot_variants is not None and len(plot_variants) > 1: 309 | axv[-1].add_artist(leg)#, clip_on=False) 310 | 311 | if variant_id is not None and title is None: 312 | axv[-1].set_title(f"{gene.name} :: {variant_id.split('_b')[0].replace('_',':',1).replace('_','-')}", fontsize=11) 313 | else: 314 | axv[-1].set_title(title, fontsize=11) 315 | 316 | # plot variant(s) 317 | def _plot_variant(x, color='tab:red'): 318 | for ax in axv: 319 | xlim = np.diff(ax.get_xlim())[0] 320 | ylim = np.diff(ax.get_ylim())[0] 321 | h = 0.04 * ylim 322 | b = h/np.sqrt(3) * ah/aw * xlim/ylim 323 | v = np.array([[x-b, -h-0.01*ylim], [x+b, -h-0.01*ylim], [x, -0.01*ylim]]) 324 | ax.add_patch(patches.Polygon(v, closed=True, color=color, ec='k', lw=0.5, clip_on=False, zorder=10)) 325 | 326 | if isinstance(plot_variants, str): 327 | x = gene.map_pos(int(plot_variants.split('_')[1])) 328 | _plot_variant(x) 329 | elif isinstance(plot_variants, Iterable): 330 | for i in plot_variants: 331 | ipos = int(i.split('_')[1]) 332 | x = gene.map_pos(ipos) 333 | if pos is not None and ipos == pos: 334 | _plot_variant(x, color='tab:red') 335 | else: 336 | _plot_variant(x, color='tab:orange') 337 | elif plot_variants == True and pos is not None: 338 | x = gene.map_pos(pos) 339 | _plot_variant(x) 340 | 341 | if plot_variants is not None: 342 | kwargs = {'ec':'k', 'lw':0.5, 's':20, 'marker':'^'} 343 | h1 = ax.scatter(np.nan, np.nan, fc='tab:red', **kwargs, label='Lead') 344 | h2 = ax.scatter(np.nan, np.nan, fc='tab:orange', **kwargs, label='Other') 345 | if len(plot_variants) > 1: 346 | ax.legend(handles=[h1,h2], loc='lower left', title='CS variants', 347 | handlelength=1, handletextpad=0.5, borderaxespad=0, bbox_to_anchor=(1.02, 0)) 348 | 349 | ax.set_ylim([0, ax.get_ylim()[1]]) 350 | 351 | # plot highlight/shading 352 | if shade_range is not None: 353 | if isinstance(shade_range, str): 354 | shade_range = shade_range.split(':')[-1].split('-') 355 | shade_range = np.array(shade_range).astype(int) 356 | shade_range = gene.map_pos(shade_range) 357 | for k in range(len(shade_range)-1): 358 | axv[-1].add_patch(patches.Rectangle((shade_range[k], 0), shade_range[k+1]-shade_range[k], ax.get_ylim()[1], 359 | facecolor=[0.8]*3 if k % 2 == 0 else [0.9]*3, zorder=-10)) 360 | 361 | # add gene model 362 | gax = fig.add_axes([dl/fw, db/fh, aw/fw, da/fh], sharex=axv[0]) 363 | gene.plot(ax=gax, max_intron=max_intron, wx=0.2, highlight_introns=highlight_introns, 364 | highlight_introns2=highlight_introns2, ec='none', clip_on=True, 365 | pc_color=pc_color, nc_color=nc_color, show_cds=show_cds) 366 | gax.set_title('') 367 | if nt < 3: 368 | gax.set_ylabel('Isoforms', fontsize=10, 
rotation=0, ha='right', va='center') 369 | else: 370 | gax.set_ylabel('Isoforms', fontsize=10, labelpad=15) 371 | plt.setp(gax.get_xticklabels(), visible=False) 372 | plt.setp(gax.get_yticklabels(), visible=False) 373 | for s in ['top', 'right', 'bottom', 'left']: 374 | gax.spines[s].set_visible(False) 375 | gax.tick_params(length=0, labelbottom=False) 376 | axv.append(gax) 377 | 378 | if mappability_bigwig is not None: # add mappability 379 | xi = gene.map_pos(pileup_dfs[0].index) 380 | # c = gene.get_coverage(mappability_bigwig) 381 | with pyBigWig.open(mappability_bigwig) as bw: 382 | c = bw.values(gene.chr, int(pileup_dfs[0].index[0]-1), int(pileup_dfs[0].index[-1]), numpy=True) 383 | mpax = fig.add_axes([dl/fw, 0.25/fh, aw/fw, da2/fh], sharex=axv[0]) 384 | mpax.fill_between(xi, c, color=3*[0.6], lw=1, interpolate=False, rasterized=rasterized) 385 | for i in ['top', 'right']: 386 | mpax.spines[i].set_visible(False) 387 | mpax.spines[i].set_linewidth(0.6) 388 | mpax.set_ylabel('Map.', fontsize=10, rotation=0, ha='right', va='center') 389 | mpax.tick_params(axis='x', length=0, labelbottom=False) 390 | mpax.tick_params(axis='y', labelsize=8) 391 | mpax.spines['left'].set_position(('outward', 6)) 392 | axv.append(mpax) 393 | plt.sca(axv[0]) 394 | 395 | if annot_track is not None: 396 | tax = fig.add_axes([dl/fw, 0/fh, aw/fw, da2/fh], sharex=axv[0]) 397 | gene.plot_coverage(coverage=annot_track, ax=tax, max_intron=max_intron) 398 | tax.tick_params(length=0, labelbottom=False) 399 | # axv[-1].set_xlabel(f'Exon coordinates on {gene.chr}', fontsize=12) 400 | 401 | # need to plot last since this is plotted in a separate set of axes 402 | if junctions_df is not None: 403 | junctions_df = junctions_df.copy() 404 | junctions_df['start'] = junctions_df.index.map(lambda x: int(x.split(':')[-1].split('-')[0])) 405 | junctions_df['end'] = junctions_df.index.map(lambda x: int(x.split(':')[-1].split('-')[1])) 406 | for k,i in enumerate(sorder): 407 | s = pileup_dfs[0][i].copy() 408 | if junction_colors is not None: 409 | ec = junction_colors[i] 410 | else: 411 | ec = cycler_colors[k] 412 | gene.plot_junctions(ax, junctions_df, s, show_counts=False, align='minimum', count_col=i, 413 | h=0.3, lw=junction_lw, lw_fct=np.sqrt, ec=ec, alpha=junction_alpha, clip_on=True) 414 | 415 | return axv 416 | -------------------------------------------------------------------------------- /qtl/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy.stats 4 | import re 5 | import matplotlib.pyplot as plt 6 | import matplotlib.patches as patches 7 | import matplotlib.ticker as ticker 8 | import matplotlib.colors as colors 9 | from matplotlib.colors import hsv_to_rgb, ListedColormap, LogNorm 10 | import seaborn as sns 11 | import scipy.cluster.hierarchy as hierarchy 12 | from cycler import cycler 13 | from collections.abc import Iterable 14 | import copy 15 | import itertools 16 | 17 | from . import stats 18 | from . 
import map as qtl_map 19 | 20 | 21 | def setup_figure(aw=4.5, ah=3, xspace=[0.75,0.25], yspace=[0.75,0.25], 22 | colorbar=None, ds=0.15, cw=0.12, ct=0, ch=None, 23 | margin_axes=None, mx=0.5, dx=0.15, my=0.5, dy=0.15, polar=False): 24 | """ 25 | """ 26 | dl, dr = xspace 27 | db, dt = yspace 28 | fw = dl + aw + dr 29 | fh = db + ah + dt 30 | if margin_axes in ['x', 'both']: 31 | fw += dx + mx 32 | if margin_axes in ['y', 'both']: 33 | fh += dy + my 34 | fig = plt.figure(facecolor='none', figsize=(fw,fh)) 35 | axes = [fig.add_axes([dl/fw, db/fh, aw/fw, ah/fh], facecolor='none', zorder=1, polar=polar)] 36 | if margin_axes in ['y', 'both']: 37 | axes.append(fig.add_axes([dl/fw, (db+ah+dy)/fh, aw/fw, my/fh], sharex=axes[0], facecolor='none', zorder=0)) 38 | if margin_axes in ['x', 'both']: 39 | axes.append(fig.add_axes([(dl+aw+dx)/fw, db/fh, mx/fw, ah/fh], sharey=axes[0], facecolor='none', zorder=0)) 40 | if colorbar != 'horizontal': 41 | dl += dx + mx 42 | if colorbar == 'horizontal': 43 | if ch is None: 44 | ch = 0.4 * aw 45 | axes.append(fig.add_axes([(dl+aw-ch)/fw, (db+ah+ds)/fh, ch/fw, cw/fh], facecolor='none')) 46 | elif colorbar is not None and colorbar != False: # vertical 47 | if ch is None: 48 | ch = 0.4 * ah 49 | axes.append(fig.add_axes([(dl+aw+ds)/fw, (db+ah-ch-ct)/fh, cw/fw, ch/fh], facecolor='none')) 50 | 51 | if len(axes) == 1: 52 | axes = axes[0] 53 | return axes 54 | 55 | 56 | # if not box: 57 | # ax.spines['left'].set_position(('outward', 6)) 58 | # ax.spines['bottom'].set_position(('outward', 6)) 59 | # ax.spines['right'].set_visible(False) 60 | # ax.spines['top'].set_visible(False) 61 | # ax.tick_params(axis='both', which='both', direction='out', labelsize=fontsize) 62 | 63 | def get_axgrid(nr, nc, ntot=None, sharex=False, sharey=False, 64 | x_offset=6, y_offset=6, margins=None, polar=False, 65 | background_axes=False, 66 | dl=0.5, aw=2, dx=0.75, dr=0.25, 67 | db=0.5, ah=2, dy=0.75, dt=0.25, 68 | colorbar=None, ds=0.15, cw=0.15, ct=0, ch=None, 69 | tri=None, fontsize=10, hide=['top', 'right']): 70 | """ 71 | """ 72 | if ntot is None: 73 | ntot = nr * nc 74 | 75 | if not isinstance(aw, Iterable): 76 | aw = nc * [aw] 77 | 78 | if not isinstance(polar, Iterable): 79 | polar = ntot * [polar] 80 | 81 | fw = dl + sum(aw) + (nc-1)*dx + dr 82 | fh = db + nr*ah + (nr-1)*dy + dt 83 | fig = plt.figure(figsize=(fw,fh), facecolor='none') 84 | axes = [] 85 | n = 0 86 | 87 | if tri is None: 88 | si = lambda x: 0 89 | elif tri == 'upper': 90 | si = lambda x: x 91 | 92 | for j in range(nr): 93 | for i in range(si(j), nc): 94 | if n < ntot: 95 | ax = fig.add_axes([(dl+sum(aw[:i])+i*dx)/fw, (db+(nr-j-1)*(ah+dy))/fh, aw[i]/fw, ah/fh], facecolor='none', zorder=0, polar=polar[n], 96 | sharex=axes[0] if sharex and n>0 else None, 97 | sharey=axes[0] if sharey and n>0 else None) 98 | if not polar[n]: 99 | format_plot(ax, fontsize=fontsize, hide=hide, x_offset=x_offset, y_offset=y_offset) 100 | ax.margins(margins) 101 | axes.append(ax) 102 | n += 1 103 | 104 | if ch is None: 105 | ch = ah/2 106 | 107 | # add axes in background for plotting overlays 108 | if background_axes: 109 | bax = fig.add_axes([dl/fw, db/fh, (sum(aw) + (nc-1)*dx)/fw, (nr*ah + (nr-1)*dy)/fh], 110 | facecolor='none', zorder=-1, label='background', 111 | sharex=axes[0] if sharex and nc == 1 else None, 112 | sharey=axes[0] if sharey and nr == 1 else None) 113 | format_plot(bax, hide=['top', 'right', 'bottom', 'left']) 114 | hide_ticks(bax) 115 | bax.margins(0) 116 | else: 117 | bax = None 118 | 119 | # add colorbars 120 | if 
isinstance(colorbar, Iterable): 121 | cax = [] 122 | for k in colorbar: 123 | i = k // nc # row 124 | j = k - i*nc # col 125 | cax.append(fig.add_axes([(dl+sum(aw[:j+1])+j*dx+ds)/fw, (db+(nr-i)*ah+(nr-i-1)*dy-ch-ct)/fh, cw/fw, ch/fh])) 126 | elif colorbar == True: 127 | cax = fig.add_axes([(dl+sum(aw)+(nc-1)*dx+ds)/fw, (db+nr*ah+(nr-1)*dy-ch-ct)/fh, cw/fw, ch/fh]) 128 | else: 129 | cax = None 130 | r = [axes] 131 | if cax is not None: 132 | r.append(cax) 133 | if bax is not None: 134 | r.append(bax) 135 | if len(r) == 1: 136 | r = r[0] 137 | else: 138 | r = tuple(r) 139 | return r 140 | 141 | 142 | def hide_ticks(ax, axis='both'): 143 | if axis in ['x', 'both']: 144 | plt.setp(ax.get_xticklabels(), visible=False) 145 | for line in ax.xaxis.get_ticklines(): 146 | line.set_markersize(0) 147 | line.set_markeredgewidth(0) 148 | if axis in ['y', 'both']: 149 | plt.setp(ax.get_yticklabels(), visible=False) 150 | for line in ax.yaxis.get_ticklines(): 151 | line.set_markersize(0) 152 | line.set_markeredgewidth(0) 153 | 154 | 155 | def format_plot(ax, tick_direction='out', tick_length=4, hide=['top', 'right'], 156 | hide_spines=True, lw=1, fontsize=10, 157 | equal_limits=False, x_offset=0, y_offset=0, vmin=None): 158 | 159 | # ax.autoscale(False) 160 | for i in ['left', 'bottom', 'right', 'top']: 161 | ax.spines[i].set_linewidth(lw) 162 | 163 | ax.tick_params(axis='both', which='both', direction=tick_direction, labelsize=fontsize) 164 | 165 | # set tick positions 166 | if 'top' in hide and 'bottom' in hide: 167 | ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False) 168 | elif 'top' in hide: 169 | ax.get_xaxis().set_ticks_position('bottom') 170 | elif 'bottom' in hide: 171 | ax.get_xaxis().set_ticks_position('top') 172 | else: 173 | ax.get_xaxis().set_ticks_position('both') 174 | 175 | if 'left' in hide and 'right' in hide: 176 | ax.tick_params(axis='y', which='both', left=False, right=False, labelleft=False) 177 | elif 'left' in hide: 178 | ax.get_yaxis().set_ticks_position('right') 179 | elif 'right' in hide: 180 | ax.get_yaxis().set_ticks_position('left') 181 | elif len(hide) == 0: 182 | ax.get_xaxis().set_ticks_position('bottom') 183 | ax.get_yaxis().set_ticks_position('left') 184 | else: 185 | ax.get_yaxis().set_ticks_position('both') 186 | 187 | if hide_spines: 188 | for i in hide: 189 | ax.spines[i].set_visible(False) 190 | 191 | # adjust tick size 192 | for line in ax.xaxis.get_ticklines() + ax.yaxis.get_ticklines(): 193 | line.set_markersize(tick_length) 194 | line.set_markeredgewidth(lw) 195 | 196 | for line in (ax.xaxis.get_ticklines(minor=True) + ax.yaxis.get_ticklines(minor=True)): 197 | line.set_markersize(tick_length/2) 198 | line.set_markeredgewidth(lw/2) 199 | 200 | ax.spines['left'].set_position(('outward', y_offset)) 201 | ax.spines['bottom'].set_position(('outward', x_offset)) 202 | 203 | if equal_limits: 204 | xlim = ax.get_xlim() 205 | ylim = ax.get_ylim() 206 | lims = [np.minimum(xlim[0], ylim[0]), np.maximum(xlim[1], ylim[1])] 207 | if vmin is not None: 208 | lims[0] = vmin 209 | ax.set_xlim(lims) 210 | ax.set_ylim(lims) 211 | 212 | # ax.autoscale(True) # temporary fix? 
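# Example usage of the figure helpers above (a minimal sketch; sizes are in inches,
# and with `colorbar=True` get_axgrid also returns a colorbar axes):
#
#   ax = setup_figure(aw=3, ah=2)
#   ax.plot(range(10), range(10))
#   format_plot(ax, fontsize=9, x_offset=6, y_offset=6)
#
#   axes, cax = get_axgrid(2, 3, sharex=True, colorbar=True)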
213 | 214 | 215 | def plot_qtl(g, p, label_s=None, label_colors=None, split=False, split_colors=None, covariates_df=None, 216 | legend_text=None, show_pval=False, show_slope=False, normalized=False, loc=None, ax=None, color=[0.5]*3, 217 | variant_id=None, jitter=0, bvec=None, boxplot=False, xlabel=None, clip_on=True, 218 | aw=2, ah=2, dr=0.25, dt=0.25, db=0.5, dl=0.75, width=0.75, lw=1, 219 | ylabel='Normalized expression', title=None, show_counts=True): 220 | """""" 221 | 222 | assert p.index.equals(g.index) 223 | 224 | if show_pval or show_slope: 225 | stats_s = qtl_map.calculate_association(g, p, covariates_df=covariates_df).iloc[0] 226 | 227 | if covariates_df is not None: 228 | # only residualize the phenotype for plotting 229 | p = stats.residualize(p.copy(), covariates_df.loc[p.index]) 230 | 231 | qtl_df = pd.concat([g.round().astype(int), p], axis=1) 232 | qtl_df.columns = ['genotype', 'phenotype'] 233 | if label_s is not None: 234 | qtl_df = pd.concat([qtl_df, label_s], axis=1, sort=False) 235 | 236 | if ax is None: 237 | ax = setup_figure(aw, ah, xspace=[dl, dr], yspace=[db, dt]) 238 | 239 | if not normalized: 240 | if split and label_s is not None: 241 | if split_colors is None: 242 | split_colors = [ 243 | hsv_to_rgb([0.025, 1, 0.8]), 244 | hsv_to_rgb([0.575, 1, 0.8]) 245 | ] 246 | pal = sns.color_palette(split_colors) 247 | i = qtl_df.columns[2] 248 | sns.violinplot(x="genotype", y="phenotype", hue=i, hue_order=sorted(qtl_df[i].unique()), 249 | data=qtl_df, palette=pal, ax=ax, order=[0,1,2], density_norm='width', cut=0, linewidth=lw, width=width) 250 | l = ax.legend(loc='lower center', bbox_to_anchor=(0.5, 1), fontsize=8, handlelength=0.6, ncol=2, handletextpad=0.5, labelspacing=0.33) 251 | l.set_title(None) 252 | else: 253 | sns.violinplot(x="genotype", y="phenotype", data=qtl_df, width=width, 254 | cut=0, ax=ax, order=[0,1,2], color=color, linewidth=lw, clip_on=clip_on) 255 | else: 256 | pass 257 | # if labels is not None: 258 | # ax.scatter(g, p, c=labels, cmap=colors.LinearSegmentedColormap.from_list('m', label_colors), alpha=0.8, s=25, edgecolors='none') 259 | # else: 260 | # # ax.scatter(g, p, c=hsv_to_rgb([0.55,0.8,0.8]), alpha=0.8, s=25, edgecolors='none') 261 | # ax.scatter(g, p, c='k', alpha=0.66, s=25, edgecolors='none') 262 | 263 | ax.set_xlabel(xlabel, fontsize=12, labelpad=8) 264 | ax.set_ylabel(ylabel, fontsize=12) 265 | format_plot(ax, lw=1, fontsize=9, x_offset=6, y_offset=6) 266 | ax.set_xlim([-0.5,2.5]) 267 | ax.spines['bottom'].set_bounds([0, 2]) 268 | ax.yaxis.set_major_locator(ticker.MaxNLocator(min_n_ticks=5, nbins=5)) 269 | 270 | if show_slope: 271 | x = np.array([-0.5, 2.5]) 272 | r = scipy.stats.linregress(g, p) 273 | ax.plot(x, r.intercept + x*r.slope, linestyle=(0, (3, 2)), lw=2, color='tab:blue') 274 | 275 | if title is not None: 276 | ax.set_title(title, fontsize=12) 277 | 278 | if variant_id is not None: 279 | ref, alt = variant_id.split('_')[2:4] 280 | labels = [ 281 | f'{ref}/{ref}', 282 | f'{ref}/{alt}', 283 | f'{alt}/{alt}', 284 | ] 285 | else: 286 | labels = [0, 1, 2] 287 | 288 | if show_counts: 289 | if not split: 290 | gcounts = g.astype(int).value_counts() 291 | labels = [f"{v}\n{gcounts.get(k, 0)}" for k,v in enumerate(labels)] 292 | else: 293 | var_s = qtl_df[qtl_df.columns[2]] 294 | c = np.unique(var_s) 295 | assert len(c) == 2 296 | 297 | gcounts1 = g[var_s == c[0]].value_counts().reindex(np.arange(3), fill_value=0) 298 | gcounts2 = g[var_s == c[1]].value_counts().reindex(np.arange(3), fill_value=0) 299 | labels = 
[f"{v}\n({gcounts1[k]},{gcounts2[k]})" for k,v in enumerate(labels)] 300 | ax.set_xticks(range(len(labels)), labels) 301 | 302 | if show_pval: 303 | if stats_s['slope'] > 0: 304 | ax.text(0.05, 1, f"P = {stats_s['pval_nominal']:.2g}", va='top', ha='left', transform=ax.transAxes, fontsize=11) 305 | else: 306 | ax.text(0.95, 1, f"P = {stats_s['pval_nominal']:.2g}", va='top', ha='right', transform=ax.transAxes, fontsize=11) 307 | 308 | return ax 309 | 310 | 311 | def plot_interaction(p, g, i, variant_id=None, annot=None, covariates_df=None, lowess=None, 312 | xlabel=None, ylabel=None, title=None, alpha=0.8, s=20, fontsize=14, 313 | ah=3, aw=3): 314 | """ 315 | Plot interaction QTL 316 | 317 | Model: 318 | p = b0 + b1*g + b2*i + b3*gi 319 | 320 | Args: 321 | lowess: fraction of data to use [0,1] 322 | """ 323 | 324 | assert p.index.equals(g.index) and p.index.equals(i.index) 325 | 326 | if covariates_df is not None: 327 | assert p.index.equals(covariates_df.index) 328 | X = np.c_[len(g)*[1],g,i,g*i,covariates_df] 329 | else: 330 | X = np.c_[len(g)*[1],g,i,g*i] 331 | b,_,_,_ = np.linalg.lstsq(X, p, rcond=None) 332 | 333 | if variant_id is not None: 334 | ref, alt = variant_id.split('_')[2:4] 335 | else: 336 | ref, alt = 'ref', 'alt' 337 | labels = { 338 | 0: f'{ref}/{ref}', 339 | 1: f'{ref}/{alt}', 340 | 2: f'{alt}/{alt}', 341 | } 342 | 343 | ax = setup_figure(ah, aw) 344 | ax.margins(0.02) 345 | 346 | custom_cycler = cycler('color', [ 347 | # hsv_to_rgb([0.55,1,0.8]), 348 | # sns.color_palette("Paired")[7], # orange 349 | # hsv_to_rgb([0,1,0.8]), 350 | sns.color_palette("husl", 8)[5], # blue 351 | sns.color_palette("Paired")[7], # orange 352 | sns.color_palette("Paired")[3], # green 353 | ]) 354 | ax.set_prop_cycle(custom_cycler) 355 | 356 | gorder = [0,1,2] 357 | # gorder = [2,1,0] 358 | # mu = [p[g==g0].mean() for g0 in np.unique(g)] 359 | # if mu[0]= ld_threshold] 426 | X = v[['row', 'col']].copy().values.T 427 | X[1,:] -= start_pos 428 | x0 = np.array([[start_pos, 0]]).T 429 | R = np.array([[1, 1], [-1, 1]])/np.sqrt(2) 430 | 431 | # set up figure 432 | if ax is None: 433 | pad = 0.1 434 | dl = pad 435 | aw = 8 436 | dr = 0.5 437 | db = 0.5 438 | ah = aw/yscale # must also scale ylim below 439 | dt = pad 440 | fw = dl+aw+dr 441 | fh = db+ah+dt 442 | ds = 0.1 443 | fig = plt.figure(facecolor=(1,1,1), figsize=(fw,fh)) 444 | ax = fig.add_axes([dl/fw, db/fh, aw/fw, ah/fh]) 445 | cax = fig.add_axes([(dl+aw+ds)/fw, db/fh, 0.1/fw, 0.8/fh]) 446 | 447 | # plot 448 | X = np.dot(R, X-x0)/np.sqrt(2) + x0 449 | order = np.argsort(v[0]) 450 | h = ax.scatter(X[0,order]/xunit, X[1,order]/xunit, s=s, c=v[0].iloc[order], marker='D', clip_on=clip_on, 451 | alpha=alpha, edgecolor='none', cmap=cmap, vmin=0, vmax=1, rasterized=rasterized) 452 | 453 | if cax is not None: 454 | hc = plt.colorbar(h, cax=cax) 455 | hc.set_label('$\mathregular{R^2}$', fontsize=12, rotation=0, ha='left', va='center') 456 | hc.locator = ticker.MaxNLocator(min_n_ticks=3, nbins=2) 457 | xlim = np.array([start_pos, end_pos]) / xunit 458 | ax.set_xlim(xlim) 459 | ax.set_ylim([-np.diff(xlim)[0]/yscale, 0]) 460 | 461 | for s in ['left', 'top', 'right']: 462 | ax.spines[s].set_visible(False) 463 | ax.set_yticks([]) 464 | 465 | ax.set_xlabel(f"Position on {variant_df['chr'].iloc[0]} (Mb)", fontsize=14) 466 | 467 | if ld_bounds is not None: 468 | ci = (ld_bounds[:-1] + ld_bounds[1:]) / 2 # center position for each block 469 | y = -np.diff(ld_bounds) / 2 / xunit 470 | yi = np.array([i for i in 
itertools.chain(*itertools.zip_longest(np.zeros(len(ld_bounds)), y)) if i is not None]) 471 | xi = np.array([i for i in itertools.chain(*itertools.zip_longest(ld_bounds, ci)) if i is not None]) 472 | ax.plot(xi/xunit, yi, linestyle=(0, (4, 3)), lw=1, color='tab:red') 473 | 474 | return ax 475 | 476 | 477 | def plot_locus_summary(region_str, tracks_dict=None, ld_df=None, ld_bounds=None, coverage_cat=None, 478 | track_colors=None, labels=None, order=None, 479 | pip_df=None, pip_order=None, pip_colors=None, pip_legend=False, 480 | gene=None, ld_marker_size=1, aw=6, ah=4, dl=2, dr=1, ph=0.1, gh=0.15): 481 | """ 482 | Visualization of genetic locus, combining coverage tracks (e.g., ATAC-seq), 483 | variants (e.g., fine-mapped QTLs), genes/transcripts, and LD. 484 | 485 | Inputs: 486 | pip_df: variants with PIPs from fine-mapping 487 | tracks_dict: tracks 488 | ld_df: LD matrix 489 | """ 490 | 491 | if isinstance(tracks_dict, dict): 492 | ntracks = len(tracks_dict) 493 | ah = ntracks*0.2 494 | elif isinstance(tracks_dict, pd.DataFrame): 495 | ntracks = tracks_dict.shape[1] 496 | else: 497 | ntracks = 0 498 | ah = 0 499 | 500 | if gene is not None: 501 | if not isinstance(gene, Iterable): 502 | gene = [gene] 503 | gh = gh * len(gene) # gene model 504 | else: 505 | gh = 0 506 | 507 | db = 0.5 508 | ldh = 1 # LD plot 509 | ds0 = 0 510 | ds = 0.1 511 | if pip_df is None: 512 | ph = 0 513 | elif isinstance(pip_df, pd.DataFrame): 514 | pip_df = [pip_df] 515 | pah = [ph*df['trait_id'].nunique() for df in pip_df] 516 | 517 | dt = 0.25 518 | fw = dl + aw + dr 519 | fh = db + ldh + ds0 + gh + ds + sum(pah) + ds*(len(pah)-1) + ds + ah + dt 520 | fig = plt.figure(figsize=(fw,fh)) 521 | axes = [] 522 | if tracks_dict is not None: 523 | ax = fig.add_axes([dl/fw, (db+ldh+ds0+gh+ds+sum(pah)+len(pah)*ds)/fh, aw/fw, ah/fh], facecolor='none') 524 | axes.append(ax) 525 | if pip_df is not None: 526 | faxes = [] 527 | for k,h in enumerate(pah): 528 | fax = fig.add_axes([dl/fw, (db+ldh+ds0+gh+ds+(len(pah)-1-k)*ds+sum(pah[k+1:]))/fh, aw/fw, h/fh], 529 | facecolor='none', sharex=axes[0] if len(axes)>0 else None) 530 | faxes.append(fax) 531 | axes.append(fax) 532 | if gene is not None: 533 | gax = fig.add_axes([dl/fw, (db+ldh+ds0)/fh, aw/fw, gh/fh], facecolor='none') 534 | axes.append(gax) 535 | if ld_df is not None: 536 | lax = fig.add_axes([dl/fw, db/fh, aw/fw, ldh/fh], facecolor='none', sharex=axes[0] if len(axes)>0 else None) 537 | lcax = fig.add_axes([(dl+aw+ds)/fw, (db+ldh/2)/fh, 0.1/fw, ldh/2/fh], facecolor='none') 538 | axes.append(lax) 539 | 540 | chrom, start_pos, end_pos = re.split(':|-', region_str) 541 | start_pos = int(start_pos) 542 | end_pos = int(end_pos) 543 | x = np.arange(start_pos, end_pos+1) / 1e6 544 | 545 | if order is not None: 546 | labels = order 547 | elif tracks_dict is not None: 548 | labels = list(tracks_dict.keys()) 549 | 550 | if isinstance(tracks_dict, dict): 551 | #maxv = np.max([np.max(tracks_dict[k]) for k in labels]) 552 | for k, label in enumerate(labels): 553 | c = tracks_dict[label] 554 | y0 = (ntracks-1-k) * np.ones(len(x)) # vertical offset 555 | # if label in track_colors: 556 | # color = track_colors[label] 557 | # else: 558 | # color = 'k' 559 | # ax.fill_between(x, 0.8*c/np.nanmax(c)+y0, y0, 560 | # ax.fill_between(x, 0.8*c/15+y0, y0, 561 | maxv = np.max(c) 562 | ax.fill_between(x, 0.9*c/maxv + y0, y0, antialiased=False, linewidth=1, 563 | facecolor=track_colors.get(label, 'k') if track_colors is not None else 'k', 564 | clip_on=True, rasterized=True) 565 | 566 | 
ax.set_yticks(np.arange(ntracks)) 567 | ax.set_yticklabels([i.replace('_', ' ') for i in labels[::-1]], fontsize=8, va='bottom') 568 | ax.spines['left'].set_bounds((0, ntracks-1)) 569 | ax.spines['left'].set_position(('outward', 6)) 570 | format_plot(ax, fontsize=8, hide=['top', 'right', 'bottom'], y_offset=6) 571 | ax.set_ylim([-0.5, ntracks-0.5]) 572 | 573 | elif isinstance(tracks_dict, pd.DataFrame): # plot as heatmap 574 | ax.invert_yaxis() 575 | ax.imshow(np.log10(tracks_dict.values.T), 576 | #extent=(), 577 | aspect='auto', interpolation='none') 578 | ax2 = fig.add_axes([(dl-0.2)/fw, (db+ldh+ds0+gh+ds+ph+ds)/fh, 0.1/fw, ah/fh], sharey=ax) 579 | # format_plot(ax, fontsize=8, hide=['top', 'right', 'bottom', 'left']) 580 | format_plot(ax2, fontsize=8, hide=['top', 'right', 'bottom', 'left']) 581 | ax2.set_xticks([]) 582 | 583 | cohort_index_dict = {i:k for k,i in enumerate(np.unique(coverage_cat))} 584 | # if cohort_colors is None: 585 | n = len(cohort_index_dict) 586 | cmap = ListedColormap(plt.cm.get_cmap('Spectral', n)(np.arange(n)), 'indexed') 587 | # else: 588 | # cmap = ListedColormap(np.stack(pd.Series(cohort_index_dict).sort_values().index.map(cohort_colors))) 589 | 590 | # if orientation == 'vertical': 591 | ax2.imshow(coverage_cat.map(cohort_index_dict).values.reshape(-1,1), aspect='auto', cmap=cmap) 592 | # else: 593 | # ax.imshow(cohort_s.map(cohort_index_dict).values.reshape(1,-1), aspect='auto', cmap=cmap) 594 | # 595 | # if lax is None: 596 | # lax = ax 597 | for k,i in cohort_index_dict.items(): 598 | ax.scatter(np.nan, np.nan, marker='s', c=[cmap(i)], label=f'{k}') 599 | # if legend: 600 | ax.legend(loc='upper right', borderaxespad=None, bbox_to_anchor=(-0.05,1), handlelength=1, title='Taxonomy', fontsize=8) 601 | ax.set_ylim([ntracks-0.5, -0.5]) 602 | ax.set_yticks([]) 603 | 604 | if tracks_dict is not None: 605 | plt.setp(ax.get_xticklabels(), visible=False) 606 | if len(axes) > 0: 607 | axes[0].set_xlim([x[0], x[-1]]) 608 | 609 | if pip_df is not None: 610 | for k, df in enumerate(pip_df): 611 | fax = faxes[k] 612 | pip_order = df['trait_category'].unique() # TODO: add back as option? 
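# One row of markers per trait: the loop below stacks traits vertically,
# grouped by 'trait_category', with marker area scaled by the fine-mapping
# posterior inclusion probability (PIP). The scatter calls on (NaN, NaN)
# only create legend handles, one per category.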
613 | i = 0 614 | traits = [] 615 | for category_id in pip_order: 616 | cdf = df[df['trait_category'] == category_id] 617 | for i,(trait_id,gdf) in enumerate(cdf.groupby('trait_id', observed=True), i+1): 618 | traits.append(trait_id) 619 | fax.scatter(gdf['position']/1e6, [i-1]*gdf.shape[0], s=30*gdf['pip'], 620 | color=pip_colors.get(category_id, 'k') if pip_colors is not None else 'k', edgecolor='none', clip_on=False) 621 | if cdf.shape[0] > 0: 622 | fax.scatter(np.nan, np.nan, s=20, color=pip_colors.get(category_id, 'k') if pip_colors is not None else 'k', label=category_id, edgecolor='none') 623 | fax.invert_yaxis() 624 | fax.set_yticks(np.arange(len(traits))) 625 | 626 | fax.spines['bottom'].set_visible(False) 627 | fax.spines['top'].set_visible(False) 628 | fax.spines['left'].set_position(('outward', 6)) 629 | fax.spines['left'].set_bounds((0, i-1)) 630 | fax.spines['right'].set_visible(False) 631 | fax.set_xlim([x[0], x[-1]]) 632 | fax.set_yticklabels(traits, fontsize=7) 633 | plt.setp(fax.get_xticklabels(), visible=False) 634 | for line in fax.xaxis.get_ticklines(): 635 | line.set_markersize(0) 636 | line.set_markeredgewidth(0) 637 | 638 | if pip_legend == True: 639 | fax.legend(loc='upper left', borderaxespad=0, borderpad=0.25, bbox_to_anchor=(1.01,1), fontsize=8, handlelength=0.75, handletextpad=0.5, labelspacing=0) 640 | 641 | 642 | if gene is not None: 643 | for k,g in enumerate(gene[::-1]): 644 | g.plot(ax=gax, max_intron=1e9, pc_color='k', nc_color='k', ec='none', yoffset=k, scale=0.33, clip_on=True) 645 | gax.annotate(g.name, (g.end_pos, k), 646 | xytext=(5,0), textcoords='offset points', 647 | va='center', ha='left', fontsize=10) 648 | xlim = np.array([x[0], x[-1]])*1e6 649 | gax.set_xlim(xlim) 650 | gax.set_xticks([]) 651 | gax.set_yticks([]) 652 | gax.spines['bottom'].set_visible(False) 653 | gax.spines['top'].set_visible(False) 654 | gax.spines['left'].set_visible(False) 655 | gax.spines['right'].set_visible(False) 656 | 657 | if ld_df is not None: 658 | format_plot(lax, fontsize=10) 659 | plot_ld(ld_df, ld_bounds=ld_bounds, start_pos=start_pos, end_pos=end_pos, cmap=plt.cm.Greys, 660 | s=ld_marker_size, clip_on=True, yscale=aw/ldh, ax=lax, cax=lcax) 661 | 662 | if len(axes) > 0: 663 | axes[-1].set_xlabel(f"Position on {chrom} (Mb)", fontsize=12) 664 | 665 | return axes 666 | 667 | 668 | def plot_effects(dfs, args, ax=None, 669 | xspace=[2.25,2,0.5], yspace=[0.5,3,0.5], xlim=None, 670 | xlabel='log$\mathregular{_{2}}$(Fold enrichment)', ylabel=None): 671 | """""" 672 | 673 | if isinstance(dfs, pd.DataFrame): 674 | dfs = [dfs] 675 | args = [args] 676 | ix = dfs[0].index.tolist() 677 | for df in dfs[1:]: 678 | assert np.all(df.index == ix) 679 | 680 | if ax is None: 681 | dl, aw, dr = xspace 682 | db, ah, dt = yspace 683 | 684 | fw = dl + aw + dr 685 | fh = db + ah + dt 686 | fig = plt.figure(facecolor=(1,1,1), figsize=(fw,fh)) 687 | ax = fig.add_axes([dl/fw, db/fh, aw/fw, ah/fh]) 688 | 689 | if xlim is not None: 690 | ax.set_xlim(xlim) 691 | y = np.arange(len(ix)) 692 | ax.set_ylim([y[0]-0.5, y[-1]+0.5]) 693 | 694 | ax.plot([0,0], [-0.5,len(ix)-0.5], '--', color=[0.33]*3, lw=1, zorder=-8) 695 | 696 | n = len(dfs) 697 | d = 0 698 | if n == 2: 699 | # d = [-0.25, 0.25] 700 | # d = [-0.2, 0.2] 701 | d = [-0.15,0.15] 702 | elif n == 3: 703 | d = [-0.25, 0, 0.25] 704 | elif n == 4: 705 | d = [-0.25, -0.15, 0.15, 0.25] 706 | 707 | for k,df in enumerate(dfs): 708 | mean_col = df.columns[0] 709 | ci_cols = df.columns[1:] 710 | delta = (df[ci_cols].T - df[mean_col]).abs() 711 | 
ax.errorbar(df[mean_col], y+d[k], xerr=delta.values, **args[k]) 712 | 713 | if xlim is None: 714 | xlim = ax.get_xlim() 715 | for i in y: 716 | if i % 2 == 0: 717 | c = [0.95]*3 718 | c = [1]*3 719 | else: 720 | c = [0.75]*3 721 | c = [0.9]*3 722 | patch = patches.Rectangle((xlim[0], i-0.5), np.diff(xlim), 1, fc=c, zorder=-10) 723 | ax.add_patch(patch) 724 | 725 | ax.set_xlabel(xlabel, fontsize=12) 726 | if ylabel is not None: 727 | ax.set_ylabel(ylabel, fontsize=12) 728 | ax.set_yticks(y) 729 | ax.set_yticklabels(ix) 730 | 731 | ax.invert_yaxis() 732 | return ax 733 | 734 | 735 | def _qq_scatter(ax, pval, ntests=None, label=None, c=None, zorder=None, 736 | max_values=100000, step=1000, is_sorted=False, args=None): 737 | """""" 738 | if ntests is None: 739 | ntests = len(pval) 740 | n = len(pval) 741 | if n > max_values: 742 | xi = np.array(list(range(1, max_values+1)) + list(range(max_values+step, n+step, step))) 743 | else: 744 | xi = np.arange(1, n+1) 745 | x = -np.log10(xi/(ntests+1)) 746 | 747 | if not is_sorted: 748 | log_pval_sorted = -np.log10(np.sort(pval)) 749 | else: 750 | log_pval_sorted = -np.log10(pval) 751 | 752 | ax.scatter(x, list(log_pval_sorted[:max_values]) + list(log_pval_sorted[max_values::step]), 753 | c=c, zorder=zorder, label=label, **args) 754 | 755 | 756 | def qqplot(pval, pval_null=None, ntests=None, ntests_null=None, max_values=100000, step=1000, is_sorted=False, 757 | title='', labels=None, fontsize=12, ax=None, equal_axes=False): 758 | """QQ-plot 759 | 760 | ntests: total number of tests if not equal to len(pval), 761 | e.g., if only tail of p-value distribution is provided 762 | """ 763 | if labels is None: 764 | labels = ['', ''] 765 | if ntests is None: 766 | ntests = len(pval) 767 | 768 | if ax is None: 769 | ax = setup_figure(2,2) 770 | ax.margins(x=0.02, y=0.05) 771 | args = {'s':16, 'edgecolor':'none', 'clip_on':False, 'alpha':1, 'rasterized':True} 772 | 773 | # Q-Q plot for pval 774 | _qq_scatter(ax, pval, ntests=ntests, label=labels[0], c=None, zorder=30, 775 | max_values=max_values, step=step, is_sorted=is_sorted, args=args) 776 | 777 | # Q-Q plot for null 778 | if pval_null is not None: 779 | _qq_scatter(ax, pval_null, ntests=ntests_null, label=labels[1], c=[[0.5]*3], zorder=20, 780 | max_values=max_values, step=step, is_sorted=is_sorted, args=args) 781 | 782 | ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True, min_n_ticks=5, nbins=4)) 783 | ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True, min_n_ticks=5, nbins=4)) 784 | 785 | ax.set_xlabel('Expected -log$\mathregular{_{10}}$(p-value)', fontsize=fontsize) 786 | ax.set_ylabel('Observed -log$\mathregular{_{10}}$(p-value)', fontsize=fontsize) 787 | format_plot(ax, fontsize=fontsize-2) 788 | 789 | xlim = ax.get_xlim() 790 | ylim = ax.get_ylim() 791 | if equal_axes: 792 | m = np.maximum(xlim[1], ylim[1]) 793 | ax.set_xlim([0, m]) 794 | ax.set_ylim([0, m]) 795 | else: 796 | ax.set_xlim([0, xlim[1]]) 797 | ax.set_ylim([0, ylim[1]]) 798 | 799 | # plot confidence interval 800 | ci = 0.95 801 | xi = np.linspace(1, ntests, 100000) 802 | x = -np.log10(xi/(ntests+1)) 803 | clower = -np.log10(scipy.stats.beta.ppf((1-ci)/2, xi, xi[::-1])) 804 | cupper = -np.log10(scipy.stats.beta.ppf((1+ci)/2, xi, xi[::-1])) 805 | ax.fill_between(x, cupper, clower, color=[[0.8]*3], clip_on=True, rasterized=True) 806 | b = -np.log10([ntests/(ntests+1), 1/(ntests+1)]) 807 | ax.autoscale(False) 808 | ax.plot(b, b, '--', lw=1, color=[0.2]*3, zorder=50, clip_on=True) 809 | 810 | 
ax.spines['left'].set_position(('outward', 6)) 811 | ax.spines['bottom'].set_position(('outward', 6)) 812 | ax.set_title(f'{title}', fontsize=12) 813 | if labels[0] != '': 814 | ax.legend(loc='upper left', fontsize=10, handlelength=0.5, handletextpad=0.33) 815 | return ax 816 | 817 | 818 | class CohortLabel(object): 819 | def __init__(self, cohort_s, cmap=None, colors=None, label_pos='left', vmin=None, vmax=None, bad_color=None): 820 | assert cmap is not None or colors is not None 821 | assert not cohort_s.index.duplicated().any() 822 | if cohort_s.dtype == 'O': 823 | cohort_s = cohort_s.astype('category') 824 | self.cohort_s = cohort_s 825 | if cmap is not None and bad_color is not None: 826 | cmap = copy.copy(cmap) 827 | cmap.set_bad(bad_color, 1) 828 | self.cmap = cmap 829 | self.vmin = vmin 830 | self.vmax = vmax 831 | self.name = cohort_s.name 832 | self.label_pos = label_pos 833 | self.colors = colors 834 | 835 | if cohort_s.dtype.name == 'category': 836 | # get numerical index 837 | self.values_s = cohort_s.astype(str).map({j:i for i,j in enumerate(cohort_s.cat.categories)}) 838 | if colors is None: 839 | n = len(cohort_s.cat.categories) 840 | colors = cmap(np.linspace(0, 1, np.maximum(n, 5))) 841 | self.colors = {k:v for k,v in zip(cohort_s.cat.categories, colors)} 842 | self.cmap = ListedColormap(cohort_s.cat.categories.map(self.colors)) 843 | else: 844 | self.values_s = cohort_s 845 | 846 | def plot(self, ix=None, ax=None, show_frame=False): 847 | if ax is None: 848 | ax, cax = setup_figure(2, 0.5, colorbar=True, ch=0.5) 849 | # ax, cax = setup_figure(0.5, 2, colorbar=True, ch=0.5) 850 | 851 | if ix is None: 852 | x = self.values_s.values 853 | else: 854 | x = self.values_s[ix].values 855 | 856 | # detect orientation 857 | bbox = ax.get_window_extent().transformed(ax.get_figure().dpi_scale_trans.inverted()) 858 | width, height = bbox.width, bbox.height 859 | if width > height: 860 | x = x.reshape(1, -1) 861 | else: 862 | x = x.reshape(-1, 1) 863 | h = ax.imshow(x, aspect='auto', cmap=self.cmap, interpolation='none', origin='lower') 864 | if width > height: 865 | if self.label_pos == 'left': 866 | ax.set_ylabel(self.name, fontsize=10, rotation=0, va='center', ha='right') 867 | elif self.label_pos == 'right': 868 | ax.yaxis.set_label_position('right') 869 | ax.set_ylabel(self.name, fontsize=10, rotation=0, va='center', ha='left') 870 | plt.setp(ax.get_yticklabels(), visible=False) 871 | 872 | if not show_frame: 873 | for i in ax.spines: 874 | ax.spines[i].set_visible(False) 875 | plt.setp(ax.get_xticklabels(), visible=False) 876 | plt.setp(ax.get_yticklabels(), visible=False) 877 | 878 | for line in ax.xaxis.get_ticklines() + ax.yaxis.get_ticklines(): 879 | line.set_markersize(0) 880 | line.set_markeredgewidth(0) 881 | 882 | # prepare legend 883 | if self.cohort_s.dtype.name == 'category': 884 | for c in self.cohort_s.cat.categories: 885 | ax.scatter(np.nan, np.nan, c=[self.colors[c]], label=c, s=30, marker='s') 886 | 887 | 888 | def check_labels(labels): 889 | if labels is not None: 890 | if isinstance(labels, CohortLabel): 891 | labels = [labels] 892 | else: 893 | assert all([isinstance(i, CohortLabel) for i in labels]) 894 | n = len(labels) 895 | else: 896 | n = 0 897 | return labels, n 898 | 899 | 900 | def clustermap(df, Zx=None, Zy=None, cluster=True, aw=3, ah=3, lw=1, vmin=None, vmax=None, cmap=plt.cm.Blues, 901 | norm=None, origin='lower', dendrogram_pos='top', col_labels=None, row_labels=None, 902 | fontsize=10, clabel='', cfontsize=10, label_colors=None, colorbar=True, 
colorbar_orientation='vertical', 903 | method='average', metric='euclidean', optimal_ordering=False, value_labels=False, 904 | show_xlabels=False, show_ylabels=False, tick_length=0, rotation=-45, ha='left', va='top', 905 | tri=False, rasterized=False, count_sort=False, 906 | show_frame=False, dl=1, dr=1, dt=0.2, lh=0.1, ls=0.01, 907 | db=1.5, dd=0.4, ds=0.03, ch=1, cw=0.175, dc=0.1, dtc=0): 908 | """""" 909 | col_labels, nc = check_labels(col_labels) 910 | row_labels, nr = check_labels(row_labels) 911 | 912 | if Zx is None and cluster: 913 | Zx = hierarchy.linkage(df.T, method=method, metric=metric, optimal_ordering=optimal_ordering) 914 | Zy = hierarchy.linkage(df, method=method, metric=metric, optimal_ordering=optimal_ordering) 915 | elif Zy is None and cluster: 916 | Zy = Zx 917 | 918 | fw = dl + aw + dr + nr*(lh+ls) 919 | fh = db + ah + ds + dd + dt + nc*(lh+ls) 920 | fig = plt.figure(figsize=(fw,fh), facecolor='none') 921 | dl2 = dl + nr*(lh+ls) 922 | if dendrogram_pos == 'top': 923 | ax = fig.add_axes([dl2/fw, db/fh, aw/fw, ah/fh]) 924 | # column labels 925 | tax = [] 926 | for k in range(nc): 927 | tax.append( 928 | fig.add_axes([dl2/fw, (db+ah+(k+1)*ls+k*lh)/fh, aw/fw, lh/fh], sharex=ax) 929 | ) 930 | # row labels 931 | lax = [] 932 | for k in range(nr): 933 | lax.append( 934 | fig.add_axes([(dl+k*(lh+ls))/fw, db/fh, lh/fw, ah/fh], sharey=ax) 935 | ) 936 | # dendrogram 937 | dax = fig.add_axes([dl2/fw, (db+ah+nc*(ls+lh)+ds)/fh, aw/fw, dd/fh]) 938 | axes = [ax, *lax, *tax, dax] 939 | # colorbar 940 | if colorbar: 941 | if colorbar_orientation == 'vertical': 942 | cax = fig.add_axes([(dl2+aw+dc)/fw, (db+ah-ch-dtc)/fh, cw/fw, ch/fh], label='Colorbar') 943 | else: 944 | cax = fig.add_axes([(dl2+aw-ch-dtc)/fw, (db-cw-dc)/fh, ch/fw, cw/fh], label='Colorbar') 945 | axes.append(cax) 946 | else: 947 | dax = fig.add_axes([dl/fw, db/fh, aw/fw, dd/fh]) 948 | ax = fig.add_axes([dl/fw, (db+dd+ds)/fh, aw/fw, ah/fh]) 949 | axes = [ax, dax] 950 | if colorbar: 951 | cax = fig.add_axes([(dl+aw+dc)/fw, (db+dd+ds)/fh, cw/fw, ch/fh]) 952 | axes.append(cax) 953 | 954 | if Zx is not None: 955 | with plt.rc_context({'lines.linewidth': lw}): 956 | z = hierarchy.dendrogram(Zx, ax=dax, count_sort=count_sort, orientation='top', link_color_func=lambda k: 'k') 957 | ix = df.columns[z['leaves']] # equivalent to hierarchy.leaves_list(Zx) if count_sort=False 958 | else: 959 | ix = df.columns 960 | dax.axis('off') 961 | 962 | if Zy is not None: 963 | iy = df.index[hierarchy.leaves_list(Zy)] 964 | elif df.index.equals(df.columns): 965 | iy = ix 966 | else: 967 | iy = df.index 968 | 969 | if dendrogram_pos == 'bottom': 970 | dax.invert_yaxis() 971 | 972 | df = df.loc[iy, ix].copy() 973 | if tri: 974 | if dendrogram_pos == 'top': 975 | if origin == 'upper': 976 | df.values[np.tril_indices(df.shape[0], -1)] = np.nan 977 | else: 978 | df.values[np.triu_indices(df.shape[0])] = np.nan 979 | elif dendrogram_pos == 'bottom': 980 | df.values[np.tril_indices(df.shape[0])] = np.nan 981 | 982 | if value_labels: 983 | irange = np.arange(df.shape[0]) 984 | jrange = np.arange(df.shape[1]) 985 | for i in irange: 986 | for j in jrange: 987 | if not np.isnan(df.values[j,i]): 988 | ax.text(i, j, f'{df.values[j,i]:.2f}', ha='center', va='center') 989 | 990 | h = ax.imshow(df.values, origin=origin, cmap=cmap, vmin=vmin, vmax=vmax, norm=norm, 991 | interpolation='none', rasterized=rasterized, aspect='auto') 992 | if show_xlabels: 993 | ax.set_xticks(np.arange(df.shape[1])) 994 | if rotation not in [90, -90]: 995 | ax.set_xticklabels(ix, 
rotation=rotation, rotation_mode='anchor', ha=ha, va=va, fontsize=fontsize) 996 | else: 997 | ax.set_xticklabels(ix, rotation=rotation, ha=ha, va=va, fontsize=fontsize) 998 | ax.tick_params(axis='x', length=tick_length) 999 | if show_xlabels == 'top': 1000 | ax.tick_params(top=True, labeltop=True, bottom=False, labelbottom=False) 1001 | else: 1002 | ax.set_xticks([]) 1003 | 1004 | if show_ylabels: 1005 | ax.set_yticks(np.arange(df.shape[0])) 1006 | ax.set_yticklabels(iy, fontsize=fontsize) 1007 | ax.tick_params(axis='y', length=tick_length) 1008 | else: 1009 | ax.set_yticks([]) 1010 | 1011 | # plot labels 1012 | for k in range(nr): 1013 | row_labels[k].plot(ax=lax[k], ix=iy, show_frame=True) 1014 | for k in range(nc): 1015 | col_labels[k].plot(ax=tax[k], ix=ix, show_frame=True) 1016 | 1017 | if lax: 1018 | plt.setp(ax.get_yticklabels(), visible=False) 1019 | for line in ax.yaxis.get_ticklines(): 1020 | line.set_markersize(0) 1021 | line.set_markeredgewidth(0) 1022 | 1023 | if dendrogram_pos == 'bottom': 1024 | ax.yaxis.tick_right() 1025 | # else: 1026 | # ax.xaxis.tick_top() 1027 | 1028 | if label_colors is not None: # plot color legend 1029 | s = 1.015 1030 | xlim = ax.get_xlim() 1031 | b = xlim[1] - s*np.diff(xlim) 1032 | ax.set_xlim(xlim) 1033 | ax.scatter([b]*df.shape[1], np.arange(df.shape[1]), s=40, c=label_colors.iloc[hierarchy.leaves_list(Zx)], clip_on=False) 1034 | ax.set_yticks(np.arange(df.shape[0])) 1035 | ax.set_yticklabels(iy, fontsize=fontsize) 1036 | ax.tick_params(axis='y', pad=12, length=0) 1037 | 1038 | # s = 1.02 1039 | # ylim = ax.get_ylim() 1040 | # b = ylim[1] - s*np.diff(ylim) 1041 | # ax.set_ylim(ylim) 1042 | # ax.scatter(np.arange(df.shape[1]), [b]*df.shape[1], s=36, c=label_colors[hierarchy.leaves_list(Zx)], clip_on=False) 1043 | # ax.tick_params(axis='x', pad=12) 1044 | 1045 | # plot colorbar 1046 | if colorbar: 1047 | cbar = plt.colorbar(h, cax=cax, orientation=colorbar_orientation) 1048 | if norm is None: 1049 | cax.locator_params(nbins=4) 1050 | cbar.set_label(clabel, fontsize=cfontsize+1) 1051 | cax.tick_params(labelsize=cfontsize) 1052 | 1053 | if not show_frame: 1054 | for i in ['left', 'top', 'right', 'bottom']: 1055 | ax.spines[i].set_visible(False) 1056 | 1057 | plt.sca(ax) 1058 | return axes 1059 | 1060 | 1061 | def hexdensity(x, y, bounds=None, bins='log', scale='log', 1062 | cmap=None, vmin=None, vmax=None, ax=None, cax=None, 1063 | unit='TPM', entity='genes', 1064 | gridsize=175, fontsize=12, show_corr=True, clip_on=True, rasterized=False): 1065 | """Wrapper for hexbin""" 1066 | 1067 | if ax is None: # setup new axes 1068 | ax, cax = setup_figure(2, 2, xspace=[0.75, 1], yspace=[0.75, 0.5], colorbar=True, ch=1, cw=0.12) 1069 | ax.margins(0.01) 1070 | format_plot(ax, fontsize=fontsize-2, x_offset=6, y_offset=6) 1071 | 1072 | if cmap is None: 1073 | cmap = copy.copy(plt.cm.RdYlBu_r) 1074 | cmap.set_bad('w', 1.) 
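# Note: the Spearman correlation below is computed on the full input,
# including zeros; zeros are then masked as NaN so that they are excluded
# from the log-scaled hexbin.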
1075 | 1076 | rho = scipy.stats.spearmanr(x, y)[0] 1077 | x = x.copy() 1078 | y = y.copy() 1079 | nanidx = (x == 0) | (y == 0) 1080 | if any(nanidx): 1081 | x[nanidx] = np.nan 1082 | y[nanidx] = np.nan 1083 | 1084 | h = ax.hexbin(x, y, bins=bins, xscale=scale, yscale=scale, linewidths=0.1, 1085 | gridsize=gridsize, cmap=cmap, vmin=vmin, vmax=vmax, mincnt=1, zorder=1, 1086 | clip_on=clip_on, rasterized=rasterized) 1087 | 1088 | if bounds is None: 1089 | xlim = ax.get_xlim() 1090 | ylim = ax.get_ylim() 1091 | bounds = [np.minimum(xlim[0], ylim[0]), np.maximum(xlim[1], ylim[1])] 1092 | elif len(bounds) == 2: 1093 | ax.set_xlim(bounds) 1094 | ax.set_ylim(bounds) 1095 | else: 1096 | ax.set_xlim(bounds[:2]) 1097 | ax.set_ylim(bounds[2:]) 1098 | 1099 | if show_corr: 1100 | t = ax.text(1, 0, r'$\rho$ = {:.2f}'.format(rho), transform=ax.transAxes, 1101 | ha='right', va='bottom', fontsize=fontsize, zorder=2) 1102 | t.set_bbox(dict(facecolor='w', alpha=0.5, edgecolor='none', boxstyle="round,pad=0.1")) 1103 | 1104 | if cax is not None: 1105 | hc = plt.colorbar(h, cax=cax, orientation='vertical', ticks=ticker.LogLocator(numticks=4)) 1106 | hc.set_label('log$\mathregular{_{10}}$('+entity+')', fontsize=fontsize) 1107 | 1108 | if isinstance(x, pd.Series): 1109 | ax.set_xlabel(f'{x.name} ({unit})' if unit is not None else f'{x.name}', fontsize=fontsize) 1110 | if isinstance(y, pd.Series): 1111 | ax.set_ylabel(f'{y.name} ({unit})' if unit is not None else f'{y.name}', fontsize=fontsize) 1112 | 1113 | return ax, cax 1114 | -------------------------------------------------------------------------------- /qtl/sam.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import subprocess 4 | import io 5 | 6 | 7 | 8 | 9 | def is_stranded(bam_file, paired_end=True, verbose=False): 10 | """ 11 | Determine whether the sequencing protocol was strand-specific 12 | based on reads mapping to ACTB(-) and FTL(+) 13 | """ 14 | 15 | header = subprocess.check_output(f'samtools view -H {bam_file}', shell=True).decode() 16 | header = header.strip().split('\n') 17 | header = [i.split('\t') for i in header if i.startswith('@SQ')] 18 | c = [i[1].split(':')[1] for i in header] 19 | if not np.any([i.startswith('chr') for i in c]): # assume hg19 20 | plus_str = '19:49468558-49470135' # FTL 21 | minus_str = '7:5566782-5603415' # ACTB 22 | else: # hg38 23 | plus_str = 'chr19:48965301-48966878' # FTL 24 | minus_str = 'chr7:5527151-5563784' # ACTB 25 | 26 | if paired_end: 27 | cmd = f'samtools view -q 255 -f2 -F3840 {bam_file} {plus_str} | cut -f2 | sort | uniq -c' 28 | with subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) as p: 29 | s, stderr = p.communicate() 30 | if stderr == '[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n': 31 | raise ValueError('BAM/CRAM file must be indexed.') 32 | dfp = pd.read_csv(io.StringIO(s), sep='\s+', header=None, names=['count', 'flag']).set_index('flag').squeeze() 33 | 34 | cmd = f'samtools view -q 255 -f2 -F3840 {bam_file} {minus_str} | cut -f2 | sort | uniq -c' 35 | s = subprocess.check_output(cmd, shell=True).decode() 36 | dfm = pd.read_csv(io.StringIO(s), sep='\s+', header=None, names=['count', 'flag']).set_index('flag').squeeze() 37 | 38 | dfp = dfp.reindex([147, 99, 83, 163], fill_value=0) 39 | dfm = dfm.reindex([147, 99, 83, 163], fill_value=0) 40 | 41 | p = dfp[[147, 99]].sum() / dfp.sum() 42 | m = dfm[[163, 83]].sum() / 
dfm.sum() 43 | else: 44 | s = subprocess.check_output(f'samtools view -q 255 {bam_file} {plus_str} | cut -f2 | sort | uniq -c', shell=True).decode() 45 | dfp = pd.read_csv(io.StringIO(s), sep='\s+', header=None, names=['count', 'flag']).set_index('flag').squeeze() 46 | s = subprocess.check_output(f'samtools view -q 255 {bam_file} {minus_str} | cut -f2 | sort | uniq -c', shell=True).decode() 47 | dfm = pd.read_csv(io.StringIO(s), sep='\s+', header=None, names=['count', 'flag']).set_index('flag').squeeze() 48 | dfp = dfp.reindex([0, 16], fill_value=0) 49 | dfm = dfm.reindex([0, 16], fill_value=0) 50 | p = dfp[0].sum() / dfp.sum() 51 | m = dfm[16].sum() / dfm.sum() 52 | 53 | is_stranded = (p<0.02) & (m<0.02) 54 | if verbose: 55 | print(f'Total read coverage: {dfp.sum()} (FTL), {dfm.sum()} (ACTB)') 56 | print(f'Proportion of FTL(+) reads on -strand: {p:.4g}') 57 | print(f'Proportion of ACTB(-) reads on +strand: {m:.4g}') 58 | print(f'Stranded: {is_stranded}') 59 | 60 | return is_stranded 61 | -------------------------------------------------------------------------------- /qtl/stats.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy.stats as stats 4 | 5 | 6 | class Residualizer(object): 7 | def __init__(self, C, fail_colinear=False): 8 | # center and orthogonalize 9 | self.Q, R = np.linalg.qr(C - np.mean(C,0)) 10 | self.dof = C.shape[0] - 2 - C.shape[1] 11 | 12 | # check for colinearity 13 | colinear_ix = np.abs(np.diag(R)) < np.finfo(np.float64).eps * C.shape[1] 14 | if np.any(colinear_ix): 15 | if fail_colinear: 16 | raise ValueError("Colinear or zero covariates detected") 17 | else: # drop colinear covariates 18 | print(f' * dropped colinear covariates: {np.sum(colinear_ix)}') 19 | self.Q = self.Q[:, ~colinear_ix] 20 | 21 | def transform(self, df, center=False): 22 | """Residualize rows of df wrt columns of C""" 23 | # transform input 24 | if isinstance(df, pd.DataFrame) or isinstance(df, pd.Series): 25 | M = df.values 26 | else: 27 | M = df 28 | 29 | isvector = False 30 | if isinstance(M, list) or (hasattr(M, 'shape') and len(M.shape)==1): 31 | M = np.array(M).reshape(1,-1) 32 | isvector = True 33 | 34 | # residualize M relative to C 35 | M0 = M - np.mean(M, axis=1, keepdims=True) 36 | if center: 37 | M0 = M0 - np.dot(np.dot(M0, self.Q), self.Q.T) 38 | else: 39 | M0 = M - np.dot(np.dot(M0, self.Q), self.Q.T) # retain original mean 40 | 41 | if isvector: 42 | M0 = M0[0] 43 | 44 | if isinstance(df, pd.DataFrame): 45 | M0 = pd.DataFrame(M0, index=df.index, columns=df.columns) 46 | elif isinstance(df, pd.Series): 47 | M0 = pd.Series(M0, index=df.index, name=df.name) 48 | 49 | return M0 50 | 51 | 52 | def residualize(df, C, center=False, fail_colinear=False): 53 | r = Residualizer(C, fail_colinear=fail_colinear) 54 | return r.transform(df, center=center) 55 | 56 | 57 | def center_normalize(x, axis=0): 58 | """Center and normalize x""" 59 | if isinstance(x, pd.DataFrame): 60 | x0 = x - np.mean(x.values, axis=axis, keepdims=True) 61 | return x0 / np.sqrt(np.sum(x0.pow(2).values, axis=axis, keepdims=True)) 62 | elif isinstance(x, pd.Series): 63 | x0 = x - x.mean() 64 | return x0 / np.sqrt(np.sum(x0*x0)) 65 | elif isinstance(x, np.ndarray): 66 | x0 = x - np.mean(x, axis=axis, keepdims=True) 67 | return x0 / np.sqrt(np.sum(x0*x0, axis=axis)) 68 | 69 | 70 | def padjust_bh(p): 71 | """ 72 | Benjamini-Hochberg adjusted p-values 73 | 74 | Replicates p.adjust(p, method="BH") from R 75 | """ 76 | n = len(p) 
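    # BH procedure: rank p-values in descending order, scale each by n/rank,
    # enforce monotonicity with a running minimum, and map the result back
    # to the input order via the inverse permutation (ro).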
77 | i = np.arange(n,0,-1) 78 | o = np.argsort(p)[::-1] 79 | ro = np.argsort(o) 80 | pa = np.minimum(1, np.minimum.accumulate(np.float64(n)/i * np.array(p)[o]))[ro] 81 | if isinstance(p, pd.Series): 82 | pa = pd.Series(pa, index=p.index) 83 | return pa 84 | 85 | 86 | def pi0est(p, lambda_param): 87 | """ 88 | pi0 statistic (Storey and Tibshirani, 2003) 89 | 90 | For fixed values of 'lambda'; equivalent to the qvalue::pi0est 91 | from R package qvalue 92 | """ 93 | if np.min(p) < 0 or np.max(p) > 1: 94 | raise ValueError("p-values not in valid range [0, 1]") 95 | elif np.min(lambda_param) < 0 or np.max(lambda_param) >= 1: 96 | raise ValueError("lambda must be within [0, 1)") 97 | 98 | pi0 = np.mean(p >= lambda_param) / (1 - lambda_param) 99 | pi0 = np.minimum(pi0, 1) 100 | 101 | if pi0 <= 0: 102 | raise ValueError("The estimated pi0 <= 0. Check that you have valid p-values or use a different range of lambda.") 103 | 104 | return pi0 105 | 106 | 107 | def qvalue(p, lambda_param=0.5, pi0=None): 108 | """ 109 | q-value calculation for fixed 'lambda' from Storey and Tibshirani, 2003. 110 | """ 111 | if isinstance(p, pd.Series): 112 | ix = p.index 113 | p = p.values 114 | else: 115 | ix = None 116 | 117 | if pi0 is None: 118 | pi0 = pi0est(p, lambda_param) 119 | 120 | u = np.argsort(p) 121 | m = len(p) 122 | v = stats.rankdata(p, method='max') # sort p 123 | qvals = (pi0 * m * p)/v 124 | 125 | qvals[u[m-1]] = np.minimum(qvals[u[m-1]], 1) 126 | for i in range(m-2, -1, -1): 127 | qvals[u[i]] = np.minimum(qvals[u[i]], qvals[u[i+1]]) 128 | 129 | if ix is not None: 130 | qvals = pd.Series(qvals, index=ix) 131 | return qvals 132 | 133 | 134 | def bootstrap_pi1(pval, lambda_param=0.5, bounds=[2.5, 97.5], n=1000): 135 | """Compute confidence intervals for pi1 with bootstrapping""" 136 | pi1_boot = [] 137 | nfail = 0 138 | for _ in range(n): 139 | try: 140 | pi1_boot.append(1 - pi0est(np.random.choice(pval, len(pval), replace=True), lambda_param=lambda_param)) 141 | except: 142 | nfail += 1 143 | if nfail > 0: 144 | print(f'Warning: {nfail} bootstraps failed') 145 | pi1_boot = np.array(pi1_boot) 146 | if len(pi1_boot) > 0: 147 | ci = np.percentile(pi1_boot, bounds) 148 | else: 149 | ci = np.array([np.nan, np.nan]) 150 | return ci 151 | -------------------------------------------------------------------------------- /qtl/torus.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy.stats as stats 4 | import os 5 | import matplotlib.pyplot as plt 6 | import matplotlib.patches as patches 7 | 8 | from . 
import plot 9 | 10 | torus_dict = { 11 | 'TF_BINDING_SITE': 'TF binding site', 12 | 'CTCF_BINDING_SITE': 'CTCF binding site', 13 | 'INTRON_VARIANT': 'Intron variant', 14 | 'SYNONYMOUS_VARIANT': 'Synonymous variant', 15 | 'SPLICE_DONOR_VARIANT': 'Splice donor variant', 16 | 'NON_CODING_TRANSCRIPT_EXON_VARIANT': 'Non-coding transcript exon variant', 17 | 'MISSENSE_VARIANT': 'Missense variant', 18 | 'STOP_GAINED': 'Stop gained', 19 | '3_PRIME_UTR_VARIANT': "3' UTR variant", 20 | 'FRAMESHIFT_VARIANT': 'Frameshift variant', 21 | 'OPEN_CHROMATIN_REGION': 'Open chromatin region', 22 | 'SPLICE_REGION_VARIANT': 'Splice region variant', 23 | '5_PRIME_UTR_VARIANT': "5' UTR variant", 24 | 'PROMOTER': 'Promoter', 25 | 'ENHANCER': 'Enhancer', 26 | 'PROMOTER_FLANKING_REGION': 'Promoter-flanking region', 27 | 'SPLICE_ACCEPTOR_VARIANT': 'Splice acceptor variant', 28 | } 29 | 30 | torus_short_dict = {i:i.replace(' variant', '') for i in torus_dict.values()} 31 | torus_short_dict['Open chromatin region'] = 'Open chromatin' 32 | torus_short_dict['Promoter-flanking region'] = 'Promoter-flanking' 33 | torus_short_dict['Non-coding transcript exon variant'] = 'NC transcript' 34 | 35 | # enhancer_d: Enhancer 36 | # promoter_d: Promoter 37 | # open_chromatin_region_d: Open chromatin 38 | # promoter_flanking_region_d: Promoter-flanking 39 | # CTCF_binding_site_d: CTCF binding site 40 | # TF_binding_site_d: TF binding site 41 | # 3_prime_UTR_variant_d: 3' UTR 42 | # 5_prime_UTR_variant_d: 5' UTR 43 | # frameshift_variant_d: Frameshift 44 | # intron_variant_d: Intron 45 | # missense_variant_d: Missense 46 | # non_coding_transcript_exon_variant_d: NC transcript 47 | # splice_acceptor_variant_d: Splice acceptor 48 | # splice_donor_variant_d: Splice donor 49 | # splice_region_variant_d: Splice region 50 | # stop_gained_d: Stop gained 51 | # synonymous_variant_d: Synonymous 52 | 53 | 54 | def convert_torus(tensorqtl_files, out_file, phenotype_groups_file=None, mode='xQTL'): 55 | """Convert tensorQTL parquet files to Torus input format""" 56 | if os.path.exists(out_file): 57 | raise ValueError('Output file already exists') 58 | assert mode in ['xQTL', 'ixQTL'] 59 | 60 | if phenotype_groups_file is not None: 61 | group_s = pd.read_csv(phenotype_groups_file, sep='\t', index_col=0, header=None, squeeze=True) 62 | group_size_s = group_s.value_counts() 63 | 64 | if mode=='xQTL': 65 | cols = ['phenotype_id', 'variant_id', 'tss_distance', 'pval_nominal', 'slope', 'slope_se'] 66 | elif mode=='ixQTL': 67 | cols = ['phenotype_id', 'variant_id', 'tss_distance', 'pval_gi', 'b_gi', 'b_gi_se'] 68 | 69 | for f in tensorqtl_files: 70 | print(f) 71 | df = pd.read_parquet(f, columns=cols) 72 | df['phenotype_id'] = df['phenotype_id'].apply(lambda x: x.rsplit(':',1)[-1]) 73 | if phenotype_groups_file is not None: 74 | print(' * adjusting p-values by phenotype group size') 75 | if mode=='xQTL': 76 | df['pval_nominal'] = np.minimum(df['pval_nominal']*df['phenotype_id'].map(group_size_s), 1.0) 77 | elif mode=='ixQTL': 78 | df['pval_gi'] = np.minimum(df['pval_gi']*df['phenotype_id'].map(group_size_s), 1.0) 79 | df.to_csv(out_file, sep=' ', float_format='%.6g', compression='gzip', mode='a', index=False, header=None) 80 | 81 | 82 | def load(torus_output, log2=True, short_labels=True): 83 | torus_df = pd.read_csv(torus_output, sep='\s+', index_col=0, header=None) 84 | torus_df.columns = ['mean', 'CI5', 'CI95'] 85 | torus_df.index.name = 'feature' 86 | torus_df.drop('Intercept', axis=0, inplace=True) 87 | torus_df = 
torus_df[~torus_df.index.str.startswith('dtss')] 88 | torus_df.index = torus_df.index.map(lambda x: torus_dict.get(x.strip('.1').upper(), x.strip('.1').upper())) 89 | if short_labels: 90 | torus_df.index = torus_df.index.map(lambda x: torus_short_dict.get(x, x)) 91 | if log2: 92 | torus_df *= np.log2(np.e) 93 | return torus_df 94 | 95 | 96 | def load_summary(summary_file, log2=True): 97 | """Load aggregated output""" 98 | 99 | torus_df = pd.read_csv(summary_file, sep='\t', index_col=0) 100 | torus_df = torus_df[~torus_df.index.str.startswith('dtss')] 101 | torus_df.drop('Intercept', inplace=True) 102 | torus_df.index = torus_df.index.map(lambda x: torus_dict[x.strip('.1').upper()]) 103 | 104 | lor_df = torus_df[torus_df.columns[torus_df.columns.str.endswith('lor')]].copy() 105 | lor_df.columns = lor_df.columns.str.replace('.lor','') 106 | 107 | if log2: 108 | lor_df = np.log2(np.exp(lor_df)) 109 | 110 | return lor_df 111 | 112 | 113 | def test_significance(torus_df1, torus_df2): 114 | assert np.all(torus_df1.index==torus_df2.index) 115 | se = (torus_df1['CI95']-torus_df1['CI5'] + torus_df2['CI95']-torus_df2['CI5']) / 3.919927969080108 116 | mu = torus_df1['mean'] - torus_df2['mean'] 117 | zstat = mu / se 118 | pval = 2*stats.norm.sf(np.abs(zstat)) 119 | m = pval<0.05/torus_df1.shape[0] 120 | return pd.DataFrame([pval, m], index=['pval', 'signif_bonferroni'], columns=torus_df1.index).T 121 | --------------------------------------------------------------------------------
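Usage sketch for the torus module (file names are hypothetical; a minimal example assuming Torus output for two QTL types with matching annotation sets, as required by test_significance):

    import qtl.torus as torus
    import qtl.plot as plot

    eqtl_df = torus.load('eqtl.torus.out')    # columns: mean, CI5, CI95 (log2-scaled by default)
    sqtl_df = torus.load('sqtl.torus.out')
    signif_df = torus.test_significance(eqtl_df, sqtl_df)
    plot.plot_effects([eqtl_df, sqtl_df],
                      [{'fmt': 'o', 'color': 'tab:blue', 'ms': 4},
                       {'fmt': 'o', 'color': 'tab:orange', 'ms': 4}])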