├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── docs └── outputs.md ├── example ├── GTEx_v8_example.ipynb ├── data │ ├── GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18.pgen │ ├── GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18.psam │ ├── GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18.pvar │ ├── GEUVADIS.445_samples.covariates.txt │ └── GEUVADIS.445_samples.expression.bed.gz └── tensorqtl_examples.ipynb ├── install ├── INSTALL.md ├── install_cuda.sh └── tensorqtl_env.yml ├── pyproject.toml └── tensorqtl ├── __init__.py ├── __main__.py ├── cis.py ├── coloc.py ├── core.py ├── eigenmt.py ├── genotypeio.py ├── mixqtl.py ├── pgen.py ├── post.py ├── rfunc.py ├── susie.py ├── tensorqtl.py └── trans.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.egg-info/ 3 | *.ipynb_checkpoints/ 4 | build/ 5 | dist/ 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile for tensorQTL 2 | # https://gitlab.com/nvidia/container-images/cuda/blob/master/doc/unsupported-tags.md 3 | FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 4 | LABEL maintainer="Francois Aguet" 5 | 6 | RUN apt-get update && apt-get install -y software-properties-common && \ 7 | apt-get update && apt-get install -y \ 8 | apt-transport-https \ 9 | build-essential \ 10 | cmake \ 11 | curl \ 12 | libboost-all-dev \ 13 | libbz2-dev \ 14 | libcurl3-dev \ 15 | liblzma-dev \ 16 | libncurses5-dev \ 17 | libssl-dev \ 18 | python3 \ 19 | python3-pip \ 20 | sudo \ 21 | unzip \ 22 | wget \ 23 | zlib1g-dev \ 24 | && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ 25 | apt-get clean && \ 26 | apt-get autoremove -y && \ 27 | rm -rf /var/lib/{apt,dpkg,cache,log}/ 28 | 29 | # htslib 30 | RUN cd /opt && \ 31 | wget --no-check-certificate https://github.com/samtools/htslib/releases/download/1.19/htslib-1.19.tar.bz2 && \ 32 | tar -xf htslib-1.19.tar.bz2 && rm htslib-1.19.tar.bz2 && cd htslib-1.19 && \ 33 | ./configure --enable-libcurl --enable-s3 --enable-plugins --enable-gcs && \ 34 | make && make install && make clean 35 | 36 | # bcftools 37 | RUN cd /opt && \ 38 | wget --no-check-certificate https://github.com/samtools/bcftools/releases/download/1.19/bcftools-1.19.tar.bz2 && \ 39 | tar -xf bcftools-1.19.tar.bz2 && rm bcftools-1.19.tar.bz2 && cd bcftools-1.19 && \ 40 | ./configure --with-htslib=system && make && make install && make clean 41 | 42 | # install R 43 | ENV DEBIAN_FRONTEND=noninteractive 44 | RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 45 | RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/' 46 | RUN apt update && apt install -y r-base r-base-dev 47 | ENV R_LIBS_USER=/opt/R/4.0 48 | RUN Rscript -e 'if (!requireNamespace("BiocManager", quietly = TRUE)) {install.packages("BiocManager")}; BiocManager::install("qvalue");' 49 | 50 | # python modules 51 | RUN pip3 install --upgrade pip setuptools 52 | RUN pip3 install numpy pandas scipy 53 | RUN pip3 install pandas-plink ipython jupyter matplotlib pyarrow torch rpy2 gcsfs "Pgenlib>=0.90.1" 54 | RUN pip3 install tensorqtl==1.0.9 55 | 56 | # RUN cd /opt && \ 57 | # wget https://github.com/broadinstitute/tensorqtl/archive/v1.0.8.tar.gz && \ 58 | # tar -xf v1.0.8.tar.gz && mv tensorqtl-1.0.8 tensorqtl && \ 59 | # rm v1.0.8.tar.gz 60 | # RUN pip3 install 
/opt/tensorqtl/ 61 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018-2019, The Broad Institute, Inc. and The General Hospital Corporation. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## tensorQTL 2 | 3 | tensorQTL is a GPU-enabled QTL mapper, achieving ~200-300 fold faster *cis*- and *trans*-QTL mapping compared to CPU-based implementations. 4 | 5 | If you use tensorQTL in your research, please cite the following paper: 6 | [Taylor-Weiner, Aguet, et al., *Genome Biol.*, 2019](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1836-7).
7 | Empirical beta-approximated p-values are computed as described in [Ongen et al., *Bioinformatics*, 2016](https://academic.oup.com/bioinformatics/article/32/10/1479/1742545). 8 | 9 | ### Install 10 | You can install tensorQTL using pip: 11 | ``` 12 | pip3 install tensorqtl 13 | ``` 14 | or directly from this repository: 15 | ``` 16 | $ git clone git@github.com:broadinstitute/tensorqtl.git 17 | $ cd tensorqtl 18 | # install into a new virtual environment and load 19 | $ mamba env create -f install/tensorqtl_env.yml 20 | $ conda activate tensorqtl 21 | ``` 22 | To install the latest version from this repository, run 23 | ``` 24 | pip install git+https://github.com/broadinstitute/tensorqtl.git 25 | ``` 26 | 27 | To use PLINK 2 binary files ([pgen/pvar/psam](https://www.cog-genomics.org/plink/2.0/input#pgen)), [pgenlib](https://github.com/chrchang/plink-ng/tree/master/2.0/Python) must be installed using either 28 | ``` 29 | pip install Pgenlib 30 | ``` 31 | (this is included in `tensorqtl_env.yml` above), or from source: 32 | ``` 33 | git clone git@github.com:chrchang/plink-ng.git 34 | cd plink-ng/2.0/Python/ 35 | python3 setup.py build_ext 36 | python3 setup.py install 37 | ``` 38 | 39 | ### Requirements 40 | 41 | tensorQTL requires an environment configured with a GPU for optimal performance, but can also be run on a CPU. Instructions for setting up a virtual machine on Google Cloud Platform are provided [here](install/INSTALL.md). 42 | 43 | ### Input formats 44 | Three inputs are required for QTL analyses with tensorQTL: genotypes, phenotypes, and covariates. 45 | * Phenotypes must be provided in BED format, with a single header line starting with `#` and the first four columns corresponding to: `chr`, `start`, `end`, `phenotype_id`, with the remaining columns corresponding to samples (the identifiers must match those in the genotype input). In addition to .bed/.bed.gz, BED input in .parquet is also supported. The BED file can specify the center of the *cis*-window (usually the TSS), with `start == end-1`, or alternatively, start and end positions, in which case the *cis*-window is [start-window, end+window]. A function for generating a BED template from a gene annotation in GTF format is available in [pyqtl](https://github.com/broadinstitute/pyqtl) (`io.gtf_to_tss_bed`). 46 | * Covariates can be provided as a tab-delimited text file (covariates x samples) or dataframe (samples x covariates), with row and column headers. 47 | * Genotypes should preferably be in [PLINK2](https://www.cog-genomics.org/plink/2.0/) pgen/pvar/psam format, which can be generated from a VCF as follows: 48 | ``` 49 | plink2 \ 50 | --output-chr chrM \ 51 | --vcf ${plink_prefix_path}.vcf.gz \ 52 | --out ${plink_prefix_path} 53 | ``` 54 | If using `--make-bed` with PLINK 1.9 or earlier, add the `--keep-allele-order` flag. 55 | 56 | Alternatively, the genotypes can be provided in bed/bim/fam format, or as a parquet dataframe (genotypes x samples). 57 | 58 | 59 | The [examples notebook](example/tensorqtl_examples.ipynb) contains examples of all input files. The input formats for phenotypes and covariates are identical to those used by [FastQTL](https://github.com/francois-a/fastqtl). 60 | 61 | ### Examples 62 | For examples illustrating *cis*- and *trans*-QTL mapping, please see [tensorqtl_examples.ipynb](example/tensorqtl_examples.ipynb). 63 | 64 | ### Running tensorQTL 65 | This section describes how to run the different modes of tensorQTL, both from the command line and within Python. 
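A typical *cis*-QTL analysis in Python combines the steps detailed in the subsections below; for example (a condensed sketch of those steps, with placeholder file paths):
```
import pandas as pd
import tensorqtl
from tensorqtl import genotypeio, cis

# load phenotypes, covariates, and genotypes (paths are placeholders)
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed('phenotypes.bed.gz')
covariates_df = pd.read_csv('covariates.txt', sep='\t', index_col=0).T  # samples x covariates
pr = genotypeio.PlinkReader('genotypes')  # prefix of the .bed/.bim/.fam files
genotype_df = pr.load_genotypes()
variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]

# map cis-QTLs with permutations and compute q-values
cis_df = cis.map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df)
tensorqtl.calculate_qvalues(cis_df)
```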
66 | For a full list of options, run 67 | ``` 68 | python3 -m tensorqtl --help 69 | ``` 70 | 71 | #### Loading input files 72 | This section is only relevant when running tensorQTL in Python. 73 | The following imports are required: 74 | ``` 75 | import pandas as pd 76 | import tensorqtl 77 | from tensorqtl import genotypeio, cis, trans 78 | ``` 79 | Phenotypes and covariates can be loaded as follows: 80 | ``` 81 | phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(phenotype_bed_file) 82 | covariates_df = pd.read_csv(covariates_file, sep='\t', index_col=0).T # samples x covariates 83 | ``` 84 | Genotypes can be loaded as follows, where `plink_prefix_path` is the path to the genotypes in PLINK format (excluding the `.bed`/`.bim`/`.fam` extensions): 85 | ``` 86 | pr = genotypeio.PlinkReader(plink_prefix_path) 87 | # load genotypes and variants into data frames 88 | genotype_df = pr.load_genotypes() 89 | variant_df = pr.bim.set_index('snp')[['chrom', 'pos']] 90 | ``` 91 | To save memory when only a subset of samples is needed, this subset can be loaded directly (this is not strictly necessary, since tensorQTL will otherwise select the relevant samples from `genotype_df`): 92 | ``` 93 | pr = genotypeio.PlinkReader(plink_prefix_path, select_samples=phenotype_df.columns) 94 | ``` 95 | 96 | #### *cis*-QTL mapping: permutations 97 | This is the main mode for *cis*-QTL mapping. It generates phenotype-level summary statistics with empirical p-values, enabling calculation of genome-wide FDR. 98 | In Python: 99 | ``` 100 | cis_df = cis.map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df) 101 | tensorqtl.calculate_qvalues(cis_df, qvalue_lambda=0.85) 102 | ``` 103 | Shell command: 104 | ``` 105 | python3 -m tensorqtl ${plink_prefix_path} ${expression_bed} ${prefix} \ 106 | --covariates ${covariates_file} \ 107 | --mode cis 108 | ``` 109 | `${prefix}` specifies the prefix of the output files. 110 | 111 | #### *cis*-QTL mapping: summary statistics for all variant-phenotype pairs 112 | In Python: 113 | ``` 114 | cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, 115 | prefix, covariates_df, output_dir='.') 116 | ``` 117 | Shell command: 118 | ``` 119 | python3 -m tensorqtl ${plink_prefix_path} ${expression_bed} ${prefix} \ 120 | --covariates ${covariates_file} \ 121 | --mode cis_nominal 122 | ``` 123 | The results are written to a [parquet](https://parquet.apache.org/) file for each chromosome. These files can be read using `pandas`: 124 | ``` 125 | df = pd.read_parquet(file_name) 126 | ``` 127 | #### *cis*-QTL mapping: conditionally independent QTLs 128 | This mode maps conditionally independent *cis*-QTLs using the stepwise regression procedure described in [GTEx Consortium, 2017](https://www.nature.com/articles/nature24277). The output from the permutation step (see `map_cis` above) is required. 129 | In Python: 130 | ``` 131 | indep_df = cis.map_independent(genotype_df, variant_df, cis_df, 132 | phenotype_df, phenotype_pos_df, covariates_df) 133 | ``` 134 | Shell command: 135 | ``` 136 | python3 -m tensorqtl ${plink_prefix_path} ${expression_bed} ${prefix} \ 137 | --covariates ${covariates_file} \ 138 | --cis_output ${prefix}.cis_qtl.txt.gz \ 139 | --mode cis_independent 140 | ``` 141 | 
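If the permutation results (see `map_cis` above) were written to file by a previous run, they can be reloaded before calling `map_independent`, e.g. (a sketch assuming the output name used in the shell command above):
```
cis_df = pd.read_csv(f'{prefix}.cis_qtl.txt.gz', sep='\t', index_col=0)
```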
142 | #### *cis*-QTL mapping: interactions 143 | Instead of mapping the standard linear model (p ~ g), this mode includes an interaction term (p ~ g + i + gi) and returns full summary statistics for the model. The interaction term(s) must be provided as a tab-delimited text file or dataframe mapping sample IDs to interaction value(s) (if multiple interactions are used, the file must include a header with variable names). With the `run_eigenmt=True` option, [eigenMT](https://www.cell.com/ajhg/fulltext/S0002-9297(15)00492-9)-adjusted p-values are computed. 144 | In Python: 145 | ``` 146 | cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, prefix, 147 | covariates_df=covariates_df, 148 | interaction_df=interaction_df, maf_threshold_interaction=0.05, 149 | run_eigenmt=True, output_dir='.', write_top=True, write_stats=True) 150 | ``` 151 | The input options `write_top` and `write_stats` control whether the top association per phenotype and full summary statistics, respectively, are written to file. 152 | 153 | Shell command: 154 | ``` 155 | python3 -m tensorqtl ${plink_prefix_path} ${expression_bed} ${prefix} \ 156 | --covariates ${covariates_file} \ 157 | --interaction ${interactions_file} \ 158 | --best_only \ 159 | --mode cis_nominal 160 | ``` 161 | The option `--best_only` disables output of full summary statistics. 162 | 163 | Full summary statistics are saved as [parquet](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html) files for each chromosome, in `${output_dir}/${prefix}.cis_qtl_pairs.${chr}.parquet`, and the top association for each phenotype is saved to `${output_dir}/${prefix}.cis_qtl_top_assoc.txt.gz`. In these files, the columns `b_g`, `b_g_se`, `pval_g` are the effect size, standard error, and p-value of *g* in the model, with matching columns for *i* and *gi*. In the `*.cis_qtl_top_assoc.txt.gz` file, `tests_emt` is the effective number of independent variants in the cis-window estimated with eigenMT, i.e., based on the eigenvalue decomposition of the regularized genotype correlation matrix ([Davis et al., AJHG, 2016](https://www.cell.com/ajhg/fulltext/S0002-9297(15)00492-9)). `pval_emt = pval_gi * tests_emt` is the eigenMT-adjusted p-value of the interaction term, and `pval_adj_bh` is the Benjamini-Hochberg adjusted p-value corresponding to `pval_emt`.
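The top associations can then be loaded with `pandas`, e.g. (a sketch using the default `output_dir`):
```
top_df = pd.read_csv(f'{prefix}.cis_qtl_top_assoc.txt.gz', sep='\t')
```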
164 | 165 | #### *trans*-QTL mapping 166 | This mode computes nominal associations between all phenotypes and genotypes. tensorQTL generates sparse output by default (associations with p-value < 1e-5). *cis*-associations are filtered out. The output is in parquet format, with the columns variant_id, phenotype_id, pval, b, b_se, and af (see [docs/outputs.md](docs/outputs.md)). 167 | In Python: 168 | ``` 169 | trans_df = trans.map_trans(genotype_df, phenotype_df, covariates_df, 170 | return_sparse=True, pval_threshold=1e-5, maf_threshold=0.05, 171 | batch_size=20000) 172 | # remove cis-associations 173 | trans_df = trans.filter_cis(trans_df, phenotype_pos_df, variant_df, window=5000000) 174 | ``` 175 | Shell command: 176 | ``` 177 | python3 -m tensorqtl ${plink_prefix_path} ${expression_bed} ${prefix} \ 178 | --covariates ${covariates_file} \ 179 | --mode trans 180 | ``` 181 | 182 | -------------------------------------------------------------------------------- /docs/outputs.md: -------------------------------------------------------------------------------- 1 | ### Output files 2 | #### Mode `cis_nominal` 3 | Column | Description 4 | --- | --- 5 | `phenotype_id` | Phenotype ID 6 | `variant_id` | Variant ID 7 | `start_distance` | Distance between the variant and phenotype start position (e.g., TSS) 8 | `end_distance` | Distance between the variant and phenotype end position (only present if different from start position) 9 | `af` | In-sample ALT allele frequency of the variant 10 | `ma_samples` | Number of samples carrying at least one minor allele 11 | `ma_count` | Number of minor alleles 12 | `pval_nominal` | Nominal p-value of the association between the phenotype and variant 13 | `slope` | Regression slope 14 | `slope_se` | Standard error of the regression slope 15 | 16 | #### Mode `cis_nominal`, with interaction term 17 | When an interaction term is included, the output contains the following columns in place of `pval_nominal`, `slope`, and `slope_se`: 18 | Column | Description 19 | --- | --- 20 | `pval_g` | Nominal p-value of the genotype term 21 | `b_g` | Slope of the genotype term 22 | `b_g_se` | Standard error of `b_g` 23 | `pval_i` | Nominal p-value of the interaction variable 24 | `b_i` | Slope of the interaction variable 25 | `b_i_se` | Standard error of `b_i` 26 | `pval_gi` | Nominal p-value of the interaction term 27 | `b_gi` | Slope of the interaction term 28 | `b_gi_se` | Standard error of `b_gi` 29 | `tests_emt` | Effective number of independent variants (Meff) estimated by eigenMT 30 | `pval_emt` | Bonferroni-adjusted `pval_gi` (i.e., multiplied by Meff) 31 | `pval_adj_bh` | Benjamini-Hochberg adjusted `pval_emt` 32 | 33 | #### Mode `cis` 34 | Column | Description 35 | --- | --- 36 | `phenotype_id` | Phenotype ID 37 | `num_var` | Number of variants in *cis*-window 38 | `beta_shape1` | Parameter of the fitted Beta distribution 39 | `beta_shape2` | Parameter of the fitted Beta distribution 40 | `true_df` | Degrees of freedom used to compute p-values 41 | `pval_true_df` | Nominal p-value based on `true_df` 42 | `variant_id` | Variant ID 43 | `start_distance` | Distance between the variant and phenotype start position (e.g., TSS) 44 | `end_distance` | Distance between the variant and phenotype end position (only present if different from start position) 45 | `ma_samples` | Number of samples carrying at least one minor allele 46 | `ma_count` | Number of minor alleles 47 | `af` | In-sample ALT allele frequency of the variant 48 | `pval_nominal` | Nominal p-value of the association between the phenotype and variant 49 | `slope` | Regression slope 50 | `slope_se` | Standard error of the regression slope 51 | `pval_perm` | Empirical p-value from permutations 52 | `pval_beta` | Beta-approximated empirical p-value 53 | `qval` | Storey q-value corresponding to `pval_beta` 54 | `pval_nominal_threshold` | Nominal p-value threshold for 
significant associations with the phenotype 55 | 56 | #### Mode `cis_independent` 57 | The columns are the same as for `cis`, excluding `qval` and `pval_nominal_threshold`, and adding: 58 | Column | Description 59 | --- | --- 60 | `rank` | Rank of the variant for the phenotype 61 | 62 | #### Mode `trans` 63 | Column | Description 64 | --- | --- 65 | `variant_id` | Variant ID 66 | `phenotype_id` | Phenotype ID 67 | `pval` | Nominal p-value of the association between the phenotype and variant 68 | `b` | Regression slope 69 | `b_se` | Standard error of the regression slope 70 | `r2` | Squared residual genotype-phenotype correlation (only generated if `map_trans(..., return_r2=True)`) 71 | `af` | In-sample ALT allele frequency of the variant 72 | -------------------------------------------------------------------------------- /example/data/GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18.pgen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/tensorqtl/812040e15f46797d5246a56339b2a699f1c596a6/example/data/GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18.pgen -------------------------------------------------------------------------------- /example/data/GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18.psam: -------------------------------------------------------------------------------- 1 | #IID SEX 2 | HG00096 1 3 | HG00097 2 4 | HG00099 2 5 | HG00100 2 6 | HG00101 1 7 | HG00102 2 8 | HG00103 1 9 | HG00105 1 10 | HG00106 2 11 | HG00108 1 12 | HG00109 1 13 | HG00110 2 14 | HG00111 2 15 | HG00112 1 16 | HG00114 1 17 | HG00115 1 18 | HG00116 1 19 | HG00117 1 20 | HG00118 2 21 | HG00119 1 22 | HG00120 2 23 | HG00121 2 24 | HG00122 2 25 | HG00123 2 26 | HG00125 2 27 | HG00126 1 28 | HG00127 2 29 | HG00128 2 30 | HG00129 1 31 | HG00130 2 32 | HG00131 1 33 | HG00132 2 34 | HG00133 2 35 | HG00136 1 36 | HG00137 2 37 | HG00138 1 38 | HG00139 1 39 | HG00141 1 40 | HG00142 1 41 | HG00143 1 42 | HG00145 1 43 | HG00146 2 44 | HG00148 1 45 | HG00149 1 46 | HG00150 2 47 | HG00151 1 48 | HG00154 2 49 | HG00155 1 50 | HG00157 1 51 | HG00158 2 52 | HG00159 1 53 | HG00160 1 54 | HG00171 2 55 | HG00173 2 56 | HG00174 2 57 | HG00176 2 58 | HG00177 2 59 | HG00178 2 60 | HG00179 2 61 | HG00180 2 62 | HG00181 1 63 | HG00182 1 64 | HG00183 1 65 | HG00185 1 66 | HG00186 1 67 | HG00187 1 68 | HG00188 1 69 | HG00189 1 70 | HG00231 2 71 | HG00232 2 72 | HG00233 2 73 | HG00234 1 74 | HG00235 2 75 | HG00236 2 76 | HG00238 2 77 | HG00239 2 78 | HG00240 2 79 | HG00242 1 80 | HG00243 1 81 | HG00244 1 82 | HG00245 2 83 | HG00246 1 84 | HG00250 2 85 | HG00251 1 86 | HG00252 1 87 | HG00253 2 88 | HG00255 2 89 | HG00256 1 90 | HG00257 2 91 | HG00258 2 92 | HG00259 2 93 | HG00260 1 94 | HG00261 2 95 | HG00262 2 96 | HG00263 2 97 | HG00264 1 98 | HG00265 1 99 | HG00266 2 100 | HG00267 1 101 | HG00268 2 102 | HG00269 2 103 | HG00271 1 104 | HG00272 2 105 | HG00273 1 106 | HG00274 2 107 | HG00275 2 108 | HG00276 2 109 | HG00277 1 110 | HG00278 1 111 | HG00280 1 112 | HG00281 2 113 | HG00282 2 114 | HG00284 1 115 | HG00285 2 116 | HG00306 2 117 | HG00308 1 118 | HG00309 2 119 | HG00310 1 120 | HG00311 1 121 | HG00313 2 122 | HG00315 2 123 | HG00319 2 124 | HG00320 2 125 | HG00321 1 126 | HG00323 2 127 | HG00324 2 128 | HG00325 1 129 | HG00326 2 130 | HG00327 2 131 | HG00328 2 132 | HG00329 1 133 | HG00330 2 134 | HG00331 2 135 | HG00332 2 136 | HG00334 2 137 | HG00335 1 138 | HG00336 1 139 | HG00337 2 140 | 
HG00338 1 141 | HG00339 2 142 | HG00341 1 143 | HG00342 1 144 | HG00343 2 145 | HG00344 2 146 | HG00345 1 147 | HG00346 2 148 | HG00349 2 149 | HG00350 2 150 | HG00351 1 151 | HG00353 2 152 | HG00355 2 153 | HG00356 2 154 | HG00358 1 155 | HG00360 1 156 | HG00361 2 157 | HG00362 2 158 | HG00364 2 159 | HG00365 2 160 | HG00366 1 161 | HG00367 2 162 | HG00369 1 163 | HG00371 1 164 | HG00372 1 165 | HG00373 2 166 | HG00375 1 167 | HG00376 2 168 | HG00378 2 169 | HG00379 2 170 | HG00380 2 171 | HG00381 2 172 | HG00382 1 173 | HG00383 2 174 | HG00384 2 175 | HG01334 1 176 | HG01789 1 177 | HG01790 2 178 | HG01791 1 179 | HG02215 2 180 | NA06984 1 181 | NA06985 2 182 | NA06986 1 183 | NA06989 2 184 | NA06994 1 185 | NA07037 2 186 | NA07048 1 187 | NA07051 1 188 | NA07056 2 189 | NA07347 1 190 | NA07357 1 191 | NA10847 2 192 | NA10851 1 193 | NA11829 1 194 | NA11830 2 195 | NA11831 1 196 | NA11832 2 197 | NA11840 2 198 | NA11843 1 199 | NA11881 1 200 | NA11892 2 201 | NA11893 1 202 | NA11894 2 203 | NA11918 2 204 | NA11920 2 205 | NA11930 1 206 | NA11931 2 207 | NA11992 1 208 | NA11994 1 209 | NA11995 2 210 | NA12004 2 211 | NA12005 1 212 | NA12006 2 213 | NA12043 1 214 | NA12044 2 215 | NA12045 1 216 | NA12058 2 217 | NA12144 1 218 | NA12154 1 219 | NA12155 1 220 | NA12156 2 221 | NA12234 2 222 | NA12249 2 223 | NA12272 1 224 | NA12273 2 225 | NA12275 2 226 | NA12282 1 227 | NA12283 2 228 | NA12286 1 229 | NA12287 2 230 | NA12340 1 231 | NA12341 2 232 | NA12342 1 233 | NA12347 1 234 | NA12348 2 235 | NA12383 2 236 | NA12399 1 237 | NA12400 2 238 | NA12413 1 239 | NA12489 2 240 | NA12546 1 241 | NA12716 1 242 | NA12717 2 243 | NA12718 2 244 | NA12749 2 245 | NA12750 1 246 | NA12751 2 247 | NA12760 1 248 | NA12761 2 249 | NA12762 1 250 | NA12763 2 251 | NA12775 1 252 | NA12776 2 253 | NA12777 1 254 | NA12778 2 255 | NA12812 1 256 | NA12813 2 257 | NA12814 1 258 | NA12815 2 259 | NA12827 1 260 | NA12829 1 261 | NA12830 2 262 | NA12842 1 263 | NA12843 2 264 | NA12872 1 265 | NA12873 2 266 | NA12874 1 267 | NA12889 1 268 | NA12890 2 269 | NA18486 1 270 | NA18488 2 271 | NA18489 2 272 | NA18498 1 273 | NA18499 2 274 | NA18502 2 275 | NA18505 2 276 | NA18508 2 277 | NA18510 1 278 | NA18511 2 279 | NA18517 2 280 | NA18519 1 281 | NA18520 2 282 | NA18858 2 283 | NA18861 2 284 | NA18867 2 285 | NA18868 1 286 | NA18870 2 287 | NA18873 2 288 | NA18907 2 289 | NA18908 1 290 | NA18909 2 291 | NA18910 1 292 | NA18912 2 293 | NA18916 2 294 | NA18917 1 295 | NA18923 1 296 | NA18933 2 297 | NA18934 1 298 | NA19092 1 299 | NA19093 2 300 | NA19095 2 301 | NA19096 1 302 | NA19098 1 303 | NA19099 2 304 | NA19102 2 305 | NA19107 1 306 | NA19108 2 307 | NA19113 1 308 | NA19114 2 309 | NA19116 2 310 | NA19117 1 311 | NA19118 2 312 | NA19119 1 313 | NA19121 1 314 | NA19129 2 315 | NA19130 1 316 | NA19131 2 317 | NA19137 2 318 | NA19138 1 319 | NA19141 1 320 | NA19143 2 321 | NA19144 1 322 | NA19146 1 323 | NA19147 2 324 | NA19149 2 325 | NA19152 2 326 | NA19153 1 327 | NA19159 2 328 | NA19160 1 329 | NA19171 1 330 | NA19172 2 331 | NA19175 1 332 | NA19184 1 333 | NA19185 2 334 | NA19189 1 335 | NA19190 2 336 | NA19197 2 337 | NA19198 1 338 | NA19200 1 339 | NA19201 2 340 | NA19204 2 341 | NA19206 2 342 | NA19207 1 343 | NA19209 2 344 | NA19210 1 345 | NA19213 1 346 | NA19214 2 347 | NA19222 2 348 | NA19223 1 349 | NA19225 2 350 | NA19235 2 351 | NA19236 1 352 | NA19247 2 353 | NA19248 1 354 | NA19256 1 355 | NA19257 2 356 | NA20502 2 357 | NA20503 2 358 | NA20504 2 359 | NA20505 2 360 | NA20506 2 361 | NA20507 2 362 | 
NA20508 2 363 | NA20509 1 364 | NA20510 1 365 | NA20512 1 366 | NA20513 1 367 | NA20514 2 368 | NA20515 1 369 | NA20516 1 370 | NA20517 2 371 | NA20518 1 372 | NA20519 1 373 | NA20520 1 374 | NA20521 1 375 | NA20524 1 376 | NA20525 1 377 | NA20527 1 378 | NA20528 1 379 | NA20529 2 380 | NA20530 2 381 | NA20531 2 382 | NA20532 1 383 | NA20534 1 384 | NA20535 2 385 | NA20536 1 386 | NA20538 1 387 | NA20539 1 388 | NA20540 2 389 | NA20541 2 390 | NA20542 2 391 | NA20543 1 392 | NA20544 1 393 | NA20581 1 394 | NA20582 2 395 | NA20585 2 396 | NA20586 1 397 | NA20588 1 398 | NA20589 2 399 | NA20752 1 400 | NA20754 1 401 | NA20756 2 402 | NA20757 2 403 | NA20758 1 404 | NA20759 1 405 | NA20760 2 406 | NA20761 2 407 | NA20765 1 408 | NA20766 2 409 | NA20768 2 410 | NA20769 2 411 | NA20770 1 412 | NA20771 2 413 | NA20772 2 414 | NA20773 2 415 | NA20774 2 416 | NA20778 1 417 | NA20783 1 418 | NA20785 1 419 | NA20786 2 420 | NA20787 1 421 | NA20790 2 422 | NA20792 1 423 | NA20795 2 424 | NA20796 1 425 | NA20797 2 426 | NA20798 1 427 | NA20799 2 428 | NA20800 2 429 | NA20801 1 430 | NA20802 2 431 | NA20803 1 432 | NA20804 2 433 | NA20805 1 434 | NA20806 1 435 | NA20807 2 436 | NA20808 2 437 | NA20809 1 438 | NA20810 1 439 | NA20811 1 440 | NA20812 1 441 | NA20813 2 442 | NA20814 1 443 | NA20815 1 444 | NA20819 2 445 | NA20826 2 446 | NA20828 2 447 | -------------------------------------------------------------------------------- /example/data/GEUVADIS.445_samples.expression.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/tensorqtl/812040e15f46797d5246a56339b2a699f1c596a6/example/data/GEUVADIS.445_samples.expression.bed.gz -------------------------------------------------------------------------------- /example/tensorqtl_examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### *cis*- and *trans*-QTL mapping with tensorQTL\n", 8 | "\n", 9 | "This notebook provides examples for running *cis*- and *trans*-QTL mapping with tensorQTL, using open-access data from the [GEUVADIS](https://www.ebi.ac.uk/arrayexpress/experiments/E-GEUV-1/) project.\n", 10 | "\n", 11 | "#### Requirements\n", 12 | "An environment configured with a GPU and ~50GB of memory.\n", 13 | "\n", 14 | "#### Test dataset\n", 15 | "\n", 16 | "*Note: these files are provided for testing/benchmarking purposes only. They do not constitute an official release from the GEUVADIS project, and no quality-control was applied.*\n", 17 | "\n", 18 | "Genotypes in PLINK2 format (chr18 only), and normalized expression data are available [in this repository](./data/); the full dataset is available at [gs://gtex-resources/test_data/geuvadis](https://console.cloud.google.com/storage/browser/gtex-resources/test_data/geuvadis) ([requester pays](https://cloud.google.com/storage/docs/requester-pays))." 
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "torch: 2.5.1+cu124 (CUDA 12.4), device: cuda\n", 31 | "pandas: 2.2.3\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "import pandas as pd\n", 37 | "import torch\n", 38 | "import tensorqtl\n", 39 | "from tensorqtl import pgen, cis, trans, post\n", 40 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 41 | "print(f\"torch: {torch.__version__} (CUDA {torch.version.cuda}), device: {device}\")\n", 42 | "print(f\"pandas: {pd.__version__}\")\n", 43 | "\n", 44 | "# define paths to data\n", 45 | "plink_prefix_path = 'data/GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18'\n", 46 | "expression_bed = 'data/GEUVADIS.445_samples.expression.bed.gz'\n", 47 | "covariates_file = 'data/GEUVADIS.445_samples.covariates.txt'\n", 48 | "prefix = 'GEUVADIS.445_samples'\n", 49 | "\n", 50 | "# load phenotypes and covariates\n", 51 | "phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expression_bed)\n", 52 | "covariates_df = pd.read_csv(covariates_file, sep='\\t', index_col=0).T\n", 53 | "\n", 54 | "# PLINK reader for genotypes\n", 55 | "pgr = pgen.PgenReader(plink_prefix_path)\n", 56 | "genotype_df = pgr.load_genotypes()\n", 57 | "variant_df = pgr.variant_df" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "### *cis*-QTL: nominal p-values for all variant-phenotype pairs" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 2, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "cis-QTL mapping: nominal associations for all variant-phenotype pairs\n", 77 | " * 445 samples\n", 78 | " * 301 phenotypes\n", 79 | " * 26 covariates\n", 80 | " * 367759 variants\n", 81 | " * cis-window: ±1,000,000\n", 82 | " * checking phenotypes: 301/301\n", 83 | " * Computing associations\n", 84 | " Mapping chromosome chr18\n", 85 | " processing phenotype 301/301\n", 86 | " time elapsed: 0.04 min\n", 87 | " * writing output\n", 88 | "done.\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "# map all cis-associations (results for each chromosome are written to file)\n", 94 | "\n", 95 | "# all genes\n", 96 | "# cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, prefix, covariates_df=covariates_df)\n", 97 | "\n", 98 | "# genes on chr18\n", 99 | "cis.map_nominal(genotype_df, variant_df,\n", 100 | " phenotype_df.loc[phenotype_pos_df['chr'] == 'chr18'],\n", 101 | " phenotype_pos_df.loc[phenotype_pos_df['chr'] == 'chr18'],\n", 102 | " prefix, covariates_df=covariates_df)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 3, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/html": [ 113 | "
\n", 114 | "\n", 127 | "\n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | "
phenotype_idvariant_idstart_distanceafma_samplesma_countpval_nominalslopeslope_se
0ENSG00000263006.6chr18_10644_C_G_b38-984210.01685415150.580873-0.1177610.213125
1ENSG00000263006.6chr18_10847_C_A_b38-982180.01910117170.142884-0.2987260.203505
2ENSG00000263006.6chr18_11275_G_A_b38-977900.02471922220.7452310.0546190.167981
3ENSG00000263006.6chr18_11358_G_A_b38-977070.02471922220.7452310.0546190.167981
4ENSG00000263006.6chr18_11445_G_A_b38-976200.02359621210.6032760.0893780.171851
\n", 205 | "
" 206 | ], 207 | "text/plain": [ 208 | " phenotype_id variant_id start_distance af \\\n", 209 | "0 ENSG00000263006.6 chr18_10644_C_G_b38 -98421 0.016854 \n", 210 | "1 ENSG00000263006.6 chr18_10847_C_A_b38 -98218 0.019101 \n", 211 | "2 ENSG00000263006.6 chr18_11275_G_A_b38 -97790 0.024719 \n", 212 | "3 ENSG00000263006.6 chr18_11358_G_A_b38 -97707 0.024719 \n", 213 | "4 ENSG00000263006.6 chr18_11445_G_A_b38 -97620 0.023596 \n", 214 | "\n", 215 | " ma_samples ma_count pval_nominal slope slope_se \n", 216 | "0 15 15 0.580873 -0.117761 0.213125 \n", 217 | "1 17 17 0.142884 -0.298726 0.203505 \n", 218 | "2 22 22 0.745231 0.054619 0.167981 \n", 219 | "3 22 22 0.745231 0.054619 0.167981 \n", 220 | "4 21 21 0.603276 0.089378 0.171851 " 221 | ] 222 | }, 223 | "execution_count": 3, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "# load results\n", 230 | "pairs_df = pd.read_parquet(f'{prefix}.cis_qtl_pairs.chr18.parquet')\n", 231 | "pairs_df.head()" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "### *cis*-QTL: empirical p-values for phenotypes" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 4, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "name": "stdout", 248 | "output_type": "stream", 249 | "text": [ 250 | "cis-QTL mapping: empirical p-values for phenotypes\n", 251 | " * 445 samples\n", 252 | " * 301 phenotypes\n", 253 | " * 26 covariates\n", 254 | " * 367759 variants\n", 255 | " * cis-window: ±1,000,000\n", 256 | " * using seed 123456\n", 257 | " * checking phenotypes: 301/301\n", 258 | " * computing permutations\n", 259 | " processing phenotype 301/301\n", 260 | " Time elapsed: 0.31 min\n", 261 | "done.\n", 262 | "Computing q-values\n", 263 | " * Number of phenotypes tested: 301\n", 264 | " * Correlation between Beta-approximated and empirical p-values: 1.0000\n", 265 | " * Calculating q-values with lambda = 0.850\n", 266 | " * Proportion of significant phenotypes (1-pi0): 0.76\n", 267 | " * QTL phenotypes @ FDR 0.05: 205\n", 268 | " * min p-value threshold @ FDR 0.05: 0.135284\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "# all genes\n", 274 | "# cis_df = cis.map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df=covariates_df)\n", 275 | "\n", 276 | "# genes on chr18\n", 277 | "cis_df = cis.map_cis(genotype_df, variant_df, \n", 278 | " phenotype_df.loc[phenotype_pos_df['chr'] == 'chr18'],\n", 279 | " phenotype_pos_df.loc[phenotype_pos_df['chr'] == 'chr18'],\n", 280 | " covariates_df=covariates_df, seed=123456)\n", 281 | "# compute q-values (in practice, this must be run on all genes, not a subset)\n", 282 | "post.calculate_qvalues(cis_df, fdr=0.05, qvalue_lambda=0.85)" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 5, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/html": [ 293 | "
\n", 294 | "\n", 307 | "\n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | "
num_varbeta_shape1beta_shape2true_dfpval_true_dfvariant_idstart_distanceend_distancema_samplesma_countafpval_nominalslopeslope_sepval_permpval_betaqvalpval_nominal_threshold
phenotype_id
ENSG00000263006.661201.0388111138.434082374.6604008.220950e-40chr18_112535_G_A_b38347034702122510.2820224.050344e-440.7264250.0461710.0001003.677735e-382.697006e-370.000141
ENSG00000101557.1463551.0322371076.303223370.1764225.632806e-11chr18_210698_T_C_b3852315523151922220.2494383.505411e-12-0.1917120.0267490.0001003.498951e-083.563747e-080.000146
ENSG00000079134.1169211.0472191155.660156370.3560493.888738e-08chr18_243547_T_A_b38-24503-245032933830.4303375.473709e-09-0.1227200.0206020.0001002.743975e-051.916427e-050.000141
ENSG00000263884.169211.0398061152.501587369.8735057.681884e-04chr18_584440_G_C_b3831629231629281880.0988763.540399e-04-0.3308110.0918450.5748435.695498e-011.577698e-010.000139
ENSG00000158270.1181341.0549191277.927246369.4690862.516529e-09chr18_519222_C_T_b3818500185001081150.1292132.409717e-10-0.3882770.0598080.0001001.567348e-061.321136e-060.000130
\n", 460 | "
" 461 | ], 462 | "text/plain": [ 463 | " num_var beta_shape1 beta_shape2 true_df \\\n", 464 | "phenotype_id \n", 465 | "ENSG00000263006.6 6120 1.038811 1138.434082 374.660400 \n", 466 | "ENSG00000101557.14 6355 1.032237 1076.303223 370.176422 \n", 467 | "ENSG00000079134.11 6921 1.047219 1155.660156 370.356049 \n", 468 | "ENSG00000263884.1 6921 1.039806 1152.501587 369.873505 \n", 469 | "ENSG00000158270.11 8134 1.054919 1277.927246 369.469086 \n", 470 | "\n", 471 | " pval_true_df variant_id start_distance \\\n", 472 | "phenotype_id \n", 473 | "ENSG00000263006.6 8.220950e-40 chr18_112535_G_A_b38 3470 \n", 474 | "ENSG00000101557.14 5.632806e-11 chr18_210698_T_C_b38 52315 \n", 475 | "ENSG00000079134.11 3.888738e-08 chr18_243547_T_A_b38 -24503 \n", 476 | "ENSG00000263884.1 7.681884e-04 chr18_584440_G_C_b38 316292 \n", 477 | "ENSG00000158270.11 2.516529e-09 chr18_519222_C_T_b38 18500 \n", 478 | "\n", 479 | " end_distance ma_samples ma_count af \\\n", 480 | "phenotype_id \n", 481 | "ENSG00000263006.6 3470 212 251 0.282022 \n", 482 | "ENSG00000101557.14 52315 192 222 0.249438 \n", 483 | "ENSG00000079134.11 -24503 293 383 0.430337 \n", 484 | "ENSG00000263884.1 316292 81 88 0.098876 \n", 485 | "ENSG00000158270.11 18500 108 115 0.129213 \n", 486 | "\n", 487 | " pval_nominal slope slope_se pval_perm pval_beta \\\n", 488 | "phenotype_id \n", 489 | "ENSG00000263006.6 4.050344e-44 0.726425 0.046171 0.000100 3.677735e-38 \n", 490 | "ENSG00000101557.14 3.505411e-12 -0.191712 0.026749 0.000100 3.498951e-08 \n", 491 | "ENSG00000079134.11 5.473709e-09 -0.122720 0.020602 0.000100 2.743975e-05 \n", 492 | "ENSG00000263884.1 3.540399e-04 -0.330811 0.091845 0.574843 5.695498e-01 \n", 493 | "ENSG00000158270.11 2.409717e-10 -0.388277 0.059808 0.000100 1.567348e-06 \n", 494 | "\n", 495 | " qval pval_nominal_threshold \n", 496 | "phenotype_id \n", 497 | "ENSG00000263006.6 2.697006e-37 0.000141 \n", 498 | "ENSG00000101557.14 3.563747e-08 0.000146 \n", 499 | "ENSG00000079134.11 1.916427e-05 0.000141 \n", 500 | "ENSG00000263884.1 1.577698e-01 0.000139 \n", 501 | "ENSG00000158270.11 1.321136e-06 0.000130 " 502 | ] 503 | }, 504 | "execution_count": 5, 505 | "metadata": {}, 506 | "output_type": "execute_result" 507 | } 508 | ], 509 | "source": [ 510 | "cis_df.head()" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "### *trans*-QTL mapping" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 6, 523 | "metadata": {}, 524 | "outputs": [ 525 | { 526 | "name": "stdout", 527 | "output_type": "stream", 528 | "text": [ 529 | "trans-QTL mapping\n", 530 | " * 445 samples\n", 531 | " * 19836 phenotypes\n", 532 | " * 26 covariates\n", 533 | " * 367759 variants\n", 534 | " processing batch 37/37\n", 535 | " elapsed time: 0.02 min\n", 536 | " * 210838 variants passed MAF >= 0.05 filtering\n", 537 | "done.\n" 538 | ] 539 | } 540 | ], 541 | "source": [ 542 | "# run mapping\n", 543 | "# to limit output size, only associations with p-value <= 1e-5 are returned\n", 544 | "trans_df = trans.map_trans(genotype_df, phenotype_df, covariates_df, batch_size=10000,\n", 545 | " return_sparse=True, pval_threshold=1e-5, maf_threshold=0.05)" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": 7, 551 | "metadata": {}, 552 | "outputs": [], 553 | "source": [ 554 | "# remove cis-associations\n", 555 | "trans_df = trans.filter_cis(trans_df, phenotype_pos_df, variant_df, window=5000000)" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | 
"execution_count": 8, 561 | "metadata": {}, 562 | "outputs": [ 563 | { 564 | "data": { 565 | "text/html": [ 566 | "
\n", 567 | "\n", 580 | "\n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | "
variant_idphenotype_idpvalbb_seaf
1chr18_20683_A_G_b38ENSG00000163900.105.012229e-060.2095400.0453090.179775
3chr18_27346_G_T_b38ENSG00000164088.177.309937e-06-0.2656230.0584830.123596
11chr18_43564_G_A_b38ENSG00000198162.121.314060e-07-0.2029220.0377920.093258
12chr18_43564_G_A_b38ENSG00000261098.18.494569e-06-0.4219680.0935940.093258
13chr18_43611_C_T_b38ENSG00000265972.51.448981e-06-0.2723010.0556970.135955
\n", 640 | "
" 641 | ], 642 | "text/plain": [ 643 | " variant_id phenotype_id pval b b_se \\\n", 644 | "1 chr18_20683_A_G_b38 ENSG00000163900.10 5.012229e-06 0.209540 0.045309 \n", 645 | "3 chr18_27346_G_T_b38 ENSG00000164088.17 7.309937e-06 -0.265623 0.058483 \n", 646 | "11 chr18_43564_G_A_b38 ENSG00000198162.12 1.314060e-07 -0.202922 0.037792 \n", 647 | "12 chr18_43564_G_A_b38 ENSG00000261098.1 8.494569e-06 -0.421968 0.093594 \n", 648 | "13 chr18_43611_C_T_b38 ENSG00000265972.5 1.448981e-06 -0.272301 0.055697 \n", 649 | "\n", 650 | " af \n", 651 | "1 0.179775 \n", 652 | "3 0.123596 \n", 653 | "11 0.093258 \n", 654 | "12 0.093258 \n", 655 | "13 0.135955 " 656 | ] 657 | }, 658 | "execution_count": 8, 659 | "metadata": {}, 660 | "output_type": "execute_result" 661 | } 662 | ], 663 | "source": [ 664 | "trans_df.head()" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": null, 670 | "metadata": {}, 671 | "outputs": [], 672 | "source": [] 673 | } 674 | ], 675 | "metadata": { 676 | "kernelspec": { 677 | "display_name": "Python 3 (ipykernel)", 678 | "language": "python", 679 | "name": "python3" 680 | }, 681 | "language_info": { 682 | "codemirror_mode": { 683 | "name": "ipython", 684 | "version": 3 685 | }, 686 | "file_extension": ".py", 687 | "mimetype": "text/x-python", 688 | "name": "python", 689 | "nbconvert_exporter": "python", 690 | "pygments_lexer": "ipython3", 691 | "version": "3.11.9" 692 | } 693 | }, 694 | "nbformat": 4, 695 | "nbformat_minor": 4 696 | } 697 | -------------------------------------------------------------------------------- /install/INSTALL.md: -------------------------------------------------------------------------------- 1 | ### Setup CUDA drivers and PyTorch on GCP 2 | 3 | Launch a new instance configured with Ubuntu 22.04 LTS and a GPU, clone this repository, and run the following: 4 | #### Install CUDA 5 | ```bash 6 | sudo ./install_cuda.sh 7 | sudo reboot 8 | # verify 9 | nvidia-smi 10 | ``` 11 | 12 | #### Install R 13 | Required for computing q-values. Follow instructions [here](https://www.digitalocean.com/community/tutorials/how-to-install-r-on-ubuntu-22-04), then install the 'qvalue' package with 14 | ```bash 15 | if (!require("BiocManager", quietly = TRUE)) 16 | install.packages("BiocManager") 17 | BiocManager::install("qvalue") 18 | ``` 19 | 20 | #### Install Python 3 21 | Using a [conda](https://github.com/conda-forge/miniforge) environment is recommended. The `tensorqtl_env.yml` configuration contains all required packages, including `torch` and `tensorqtl`. 
22 | ```bash 23 | mamba env create -f tensorqtl_env.yml 24 | conda activate tensorqtl 25 | 26 | # verify 27 | python -c "import torch; print(torch.__version__); print('CUDA available: {} ({})'.format(torch.cuda.is_available(), torch.cuda.get_device_name(torch.cuda.current_device())))" 28 | 29 | # this should print something like 30 | # 2.1.2+cu121 31 | # CUDA available: True (Tesla P100-PCIE-16GB) 32 | ``` 33 | 34 | #### Install rmate (optional) 35 | ```bash 36 | sudo apt install -y ruby 37 | mkdir ~/bin 38 | curl -Lo ~/bin/rmate https://raw.githubusercontent.com/textmate/rmate/master/bin/rmate 39 | chmod a+x ~/bin/rmate 40 | echo 'export RMATE_PORT=${rmate_port}' >> ~/.bashrc 41 | ``` 42 | -------------------------------------------------------------------------------- /install/install_cuda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # install script for PyTorch 2.1.2 + CUDA 12.1 on Ubuntu 22.04 3 | # for torch, see https://pytorch.org/get-started/locally/ 4 | # for CUDA drivers, see https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_local 5 | # for other versions, see https://developer.nvidia.com/cuda-toolkit-archive 6 | 7 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin 8 | sudo mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600 9 | wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda-repo-ubuntu2204-12-1-local_12.1.0-530.30.02-1_amd64.deb 10 | sudo dpkg -i cuda-repo-ubuntu2204-12-1-local_12.1.0-530.30.02-1_amd64.deb 11 | sudo cp /var/cuda-repo-ubuntu2204-12-1-local/cuda-*-keyring.gpg /usr/share/keyrings/ 12 | sudo apt-get update 13 | sudo apt-get -y install cuda 14 | rm cuda-repo-ubuntu2204-12-1-local_12.1.0-530.30.02-1_amd64.deb 15 | 16 | # test 17 | python -c "import torch; print('CUDA available: {} ({})'.format(torch.cuda.is_available(), torch.cuda.get_device_name(torch.cuda.current_device())))" 18 | -------------------------------------------------------------------------------- /install/tensorqtl_env.yml: -------------------------------------------------------------------------------- 1 | name: tensorqtl 2 | dependencies: 3 | - python=3.11 4 | - pip 5 | - pip: 6 | - numpy 7 | - pandas 8 | - pandas-plink 9 | - Pgenlib>=0.90.1 10 | - pyarrow 11 | - qtl 12 | - rpy2 13 | - scipy 14 | - torch 15 | - tensorqtl 16 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.setuptools] 6 | packages = ["tensorqtl"] 7 | 8 | [project] 9 | name = "tensorqtl" 10 | version = "1.0.10" 11 | dependencies = [ 12 | "numpy", 13 | "pandas", 14 | "Pgenlib>=0.90.1", 15 | "qtl", 16 | "scipy", 17 | "torch", 18 | ] 19 | authors = [ 20 | {name = "Francois Aguet", email = "francois@broadinstitute.org"} 21 | ] 22 | maintainers = [ 23 | {name = "Francois Aguet", email = "francois@broadinstitute.org"} 24 | ] 25 | description = "GPU-accelerated QTL mapper" 26 | readme = "README.md" 27 | license = {file = "LICENSE"} 28 | keywords = ["Quantitative trait loci"] 29 | classifiers = [ 30 | "Development Status :: 4 - Beta", 31 | "Programming Language :: Python :: 3", 32 | "Intended Audience :: Science/Research", 33 | "Topic :: 
Scientific/Engineering :: Bio-Informatics", 34 | ] 35 | 36 | [project.urls] 37 | Repository = "https://github.com/broadinstitute/tensorqtl.git" 38 | -------------------------------------------------------------------------------- /tensorqtl/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | from .tensorqtl import * 3 | 4 | __version__ = importlib.metadata.version(__name__) 5 | -------------------------------------------------------------------------------- /tensorqtl/__main__.py: -------------------------------------------------------------------------------- 1 | import tensorqtl 2 | tensorqtl.main() 3 | -------------------------------------------------------------------------------- /tensorqtl/coloc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy.stats as stats 4 | import torch 5 | import os 6 | import time 7 | import sys 8 | sys.path.insert(1, os.path.dirname(__file__)) 9 | import genotypeio, eigenmt 10 | from core import * 11 | 12 | 13 | def logsumexp(x, dim=0): 14 | mmax,_ = torch.max(x, dim=dim, keepdim=True) 15 | return mmax + (x-mmax).exp().sum(dim, keepdim=True).log() 16 | 17 | 18 | def logdiff(x, y, dim=0): 19 | xmax,_ = torch.max(x, dim=dim, keepdim=True) 20 | ymax,_ = torch.max(y, dim=dim, keepdim=True) 21 | mmax = torch.max(xmax, ymax) 22 | return mmax + ((x - mmax).exp() - (y - mmax).exp()).log() 23 | 24 | 25 | def coloc(genotypes1_t, genotypes2_t, phenotype1_t, phenotype2_t, 26 | residualizer1=None, residualizer2=None, mode='beta', 27 | p1=1e-4, p2=1e-4, p12=1e-5): 28 | """COLOC from summary statistics (either beta/sds or p-values and MAF)""" 29 | 30 | assert phenotype1_t.dim() == 1 31 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 32 | 33 | # phenotype 1 34 | if mode == 'beta': 35 | r_nominal_t, genotype_var_t, phenotype_var_t = calculate_corr( 36 | genotypes1_t, phenotype1_t.reshape(1,-1), residualizer1, return_var=True) 37 | r_nominal_t = r_nominal_t.squeeze() 38 | var_ratio_t = phenotype_var_t.reshape(1,-1) / genotype_var_t.reshape(-1,1) 39 | else: 40 | r_nominal_t = calculate_corr( 41 | genotypes1_t, phenotype1_t.reshape(1,-1), residualizer1, return_var=False).squeeze() 42 | r2_nominal_t = r_nominal_t.double().pow(2) 43 | 44 | if residualizer1 is not None: 45 | dof = residualizer1.dof 46 | else: 47 | dof = phenotype1_t.shape[0] - 2 48 | 49 | if mode == 'beta': 50 | tstat2_t = r2_nominal_t * dof / (1 - r2_nominal_t) 51 | beta2_t = r2_nominal_t * var_ratio_t.squeeze() 52 | beta_var_t = beta2_t / tstat2_t 53 | var_prior = 0.0225 * phenotype_var_t 54 | r = var_prior / (var_prior + beta_var_t) 55 | l1 = 0.5 * ((1 - r).log() + r*tstat2_t) 56 | else: 57 | # compute p-values and z-score to match COLOC results exactly 58 | # (instead of directly using t-statistic) 59 | tstat_t = r_nominal_t * torch.sqrt(dof / (1 - r2_nominal_t)) 60 | p = stats.t.cdf(-np.abs(tstat_t.cpu().numpy()), dof) # 2 dropped since canceled in isf 61 | maf_t = calculate_maf(genotypes1_t) 62 | N = phenotype1_t.shape[0] 63 | v = 1 / (2 * N * maf_t * (1 - maf_t)) 64 | z2_t = torch.Tensor(stats.norm.isf(p)**2).to(device) 65 | r = 0.0225 / (0.0225 + v) 66 | l1 = 0.5 * ((1 - r).log() + r*z2_t) 67 | 68 | # phenotype 2 69 | if phenotype2_t.dim() == 1: 70 | num_phenotypes = 1 71 | num_samples = phenotype2_t.shape[0] 72 | phenotype2_t = phenotype2_t.reshape(1,-1) 73 | else: 74 | num_phenotypes, num_samples = 
phenotype2_t.shape 75 | 76 | if mode == 'beta': 77 | r_nominal_t, genotype_var_t, phenotype_var_t = calculate_corr( 78 | genotypes2_t, phenotype2_t, residualizer2, return_var=True) 79 | r_nominal_t = r_nominal_t.squeeze() 80 | var_ratio_t = phenotype_var_t.reshape(1,-1) / genotype_var_t.reshape(-1,1) 81 | else: 82 | r_nominal_t = calculate_corr(genotypes2_t, phenotype2_t, residualizer2, return_var=False).squeeze() 83 | r2_nominal_t = r_nominal_t.double().pow(2) 84 | 85 | if residualizer2 is not None: 86 | dof = residualizer2.dof 87 | else: 88 | dof = num_samples - 2 89 | 90 | if mode == 'beta': 91 | tstat2_t = r2_nominal_t * dof / (1 - r2_nominal_t) 92 | beta2_t = r2_nominal_t * var_ratio_t.squeeze() 93 | beta_var_t = beta2_t / tstat2_t 94 | var_prior = 0.0225 * phenotype_var_t 95 | r = var_prior / (var_prior + beta_var_t) 96 | l2 = 0.5 * ((1 - r).log() + r*tstat2_t) 97 | else: 98 | tstat_t = r_nominal_t * torch.sqrt(dof / (1 - r2_nominal_t)) 99 | p = stats.t.cdf(-np.abs(tstat_t.cpu().numpy()), dof) 100 | maf_t = calculate_maf(genotypes2_t) 101 | v = 1 / (2 * num_samples * maf_t * (1 - maf_t)) 102 | z2_t = torch.Tensor(stats.norm.isf(p)**2).to(device) 103 | r = 0.0225 / (0.0225 + v) 104 | if num_phenotypes > 1: 105 | r = r.reshape(-1,1) 106 | l2 = 0.5 * ((1 - r).log() + r*z2_t) 107 | 108 | if num_phenotypes > 1: 109 | lsum = l1.reshape(-1,1) + l2 110 | lh0_abf = torch.zeros([1, num_phenotypes]).to(device) 111 | lh1_abf = np.log(p1) + logsumexp(l1).repeat([1, num_phenotypes]) 112 | else: 113 | lsum = l1 + l2 114 | lh0_abf = torch.zeros([1]).to(device) 115 | lh1_abf = np.log(p1) + logsumexp(l1) 116 | lh2_abf = np.log(p2) + logsumexp(l2) 117 | lh3_abf = np.log(p1) + np.log(p2) + logdiff(logsumexp(l1) + logsumexp(l2), logsumexp(lsum)) 118 | lh4_abf = np.log(p12) + logsumexp(lsum) 119 | all_abf = torch.cat([lh0_abf, lh1_abf, lh2_abf, lh3_abf, lh4_abf]) 120 | return (all_abf - logsumexp(all_abf, dim=0)).exp().squeeze() 121 | 122 | 123 | def run_pairs(genotype_df, variant_df, phenotype1_df, phenotype2_df, phenotype_pos_df, 124 | covariates1_df=None, covariates2_df=None, p1=1e-4, p2=1e-4, p12=1e-5, mode='beta', 125 | maf_threshold=0, window=1000000, batch_size=10000, logger=None, verbose=True): 126 | """Compute COLOC for all phenotype pairs""" 127 | 128 | assert np.all(phenotype1_df.index == phenotype2_df.index) 129 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 130 | 131 | if logger is None: 132 | logger = SimpleLogger() 133 | 134 | logger.write('Computing COLOC for all pairs of phenotypes') 135 | logger.write(f' * {phenotype1_df.shape[0]} phenotypes') 136 | logger.write(f' * phenotype group 1: {phenotype1_df.shape[1]} samples') 137 | logger.write(f' * phenotype group 2: {phenotype2_df.shape[1]} samples') 138 | 139 | if covariates1_df is not None: 140 | assert np.all(phenotype1_df.columns == covariates1_df.index) 141 | logger.write(f' * phenotype group 1: {covariates1_df.shape[1]} covariates') 142 | residualizer1 = Residualizer(torch.tensor(covariates1_df.values, dtype=torch.float32).to(device)) 143 | else: 144 | residualizer1 = None 145 | 146 | if covariates2_df is not None: 147 | assert np.all(phenotype2_df.columns == covariates2_df.index) 148 | logger.write(f' * phenotype group 2: {covariates2_df.shape[1]} covariates') 149 | residualizer2 = Residualizer(torch.tensor(covariates2_df.values, dtype=torch.float32).to(device)) 150 | else: 151 | residualizer2 = None 152 | 153 | if maf_threshold > 0: 154 | logger.write(f' * applying in-sample {maf_threshold} MAF filter (in at 
least one cohort)') 155 | 156 | genotype1_ix = np.array([genotype_df.columns.tolist().index(i) for i in phenotype1_df.columns]) 157 | genotype1_ix_t = torch.from_numpy(genotype1_ix).to(device) 158 | genotype2_ix = np.array([genotype_df.columns.tolist().index(i) for i in phenotype2_df.columns]) 159 | genotype2_ix_t = torch.from_numpy(genotype2_ix).to(device) 160 | 161 | igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype1_df, phenotype_pos_df, window=window) 162 | coloc_df = [] 163 | start_time = time.time() 164 | logger.write(' * Computing pairwise colocalization') 165 | for phenotype1, genotypes, genotype_range, phenotype_id in igc.generate_data(verbose=verbose): 166 | phenotype2 = phenotype2_df.loc[phenotype_id] 167 | 168 | # copy to GPU 169 | phenotype1_t = torch.tensor(phenotype1, dtype=torch.float).to(device) 170 | phenotype2_t = torch.tensor(phenotype2, dtype=torch.float).to(device) 171 | genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device) 172 | genotypes1_t = genotypes_t[:,genotype1_ix_t] 173 | genotypes2_t = genotypes_t[:,genotype2_ix_t] 174 | del genotypes_t 175 | 176 | impute_mean(genotypes1_t) 177 | impute_mean(genotypes2_t) 178 | # filter monomorphic sites 179 | m = ((genotypes1_t==0).all(1) | (genotypes1_t==1).all(1) | (genotypes1_t==2).all(1) | 180 | (genotypes2_t==0).all(1) | (genotypes2_t==1).all(1) | (genotypes2_t==2).all(1)) 181 | genotypes1_t = genotypes1_t[~m] 182 | genotypes2_t = genotypes2_t[~m] 183 | 184 | if maf_threshold > 0: 185 | maf1_t = calculate_maf(genotypes1_t) 186 | maf2_t = calculate_maf(genotypes2_t) 187 | mask_t = (maf1_t >= maf_threshold) | (maf2_t >= maf_threshold) 188 | genotypes1_t = genotypes1_t[mask_t] 189 | genotypes2_t = genotypes2_t[mask_t] 190 | 191 | coloc_t = coloc(genotypes1_t, genotypes2_t, phenotype1_t, phenotype2_t, 192 | residualizer1=residualizer1, residualizer2=residualizer2, 193 | p1=p1, p2=p2, p12=p12, mode=mode) 194 | coloc_df.append(coloc_t.cpu().numpy()) 195 | logger.write(' time elapsed: {:.2f} min'.format((time.time()-start_time)/60)) 196 | coloc_df = pd.DataFrame(coloc_df, columns=[f'pp_h{i}_abf' for i in range(5)], index=phenotype1_df.index) 197 | logger.write('done.') 198 | return coloc_df 199 | -------------------------------------------------------------------------------- /tensorqtl/core.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import pandas as pd 4 | import scipy.stats as stats 5 | import scipy.optimize 6 | from scipy.special import loggamma 7 | import sys 8 | import re 9 | import subprocess 10 | 11 | # check R 12 | has_rpy2 = False 13 | try: 14 | subprocess.check_call('which R', shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) 15 | subprocess.check_call("R -e 'library(qvalue)'", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) 16 | import rpy2 17 | import rfunc 18 | has_rpy2 = True 19 | except: 20 | print("Warning: 'rfunc' cannot be imported. 
R with the 'qvalue' library and the 'rpy2' Python package are needed to compute q-values.") 21 | 22 | 23 | output_dtype_dict = { 24 | 'num_var':np.int32, 25 | 'beta_shape1':np.float32, 26 | 'beta_shape2':np.float32, 27 | 'true_df':np.float32, 28 | 'pval_true_df':np.float64, 29 | 'variant_id':str, 30 | 'start_distance':np.int32, 31 | 'end_distance':np.int32, 32 | 'ma_samples':np.int32, 33 | 'ma_count':np.int32, 34 | 'af':np.float32, 35 | 'pval_nominal':np.float64, 36 | 'slope':np.float32, 37 | 'slope_se':np.float32, 38 | 'pval_perm':np.float64, 39 | 'pval_beta':np.float64, 40 | } 41 | 42 | 43 | class SimpleLogger(object): 44 | def __init__(self, logfile=None, verbose=True): 45 | self.console = sys.stdout 46 | self.verbose = verbose 47 | if logfile is not None: 48 | self.log = open(logfile, 'w') 49 | else: 50 | self.log = None 51 | 52 | def write(self, message): 53 | if self.verbose: 54 | self.console.write(message+'\n') 55 | if self.log is not None: 56 | self.log.write(message+'\n') 57 | self.log.flush() 58 | 59 | #------------------------------------------------------------------------------ 60 | # Core classes/functions for mapping associations on GPU 61 | #------------------------------------------------------------------------------ 62 | class Residualizer(object): 63 | def __init__(self, C_t): 64 | # center and orthogonalize 65 | self.Q_t, _ = torch.linalg.qr(C_t - C_t.mean(0)) 66 | self.dof = C_t.shape[0] - 2 - C_t.shape[1] 67 | 68 | def transform(self, M_t, center=True): 69 | """Residualize rows of M wrt columns of C""" 70 | M0_t = M_t - M_t.mean(1, keepdim=True) 71 | if center: 72 | M0_t = M0_t - torch.mm(torch.mm(M0_t, self.Q_t), self.Q_t.t()) 73 | else: 74 | M0_t = M_t - torch.mm(torch.mm(M0_t, self.Q_t), self.Q_t.t()) 75 | return M0_t 76 | 77 | 78 | def calculate_maf(genotype_t, alleles=2): 79 | """Calculate minor allele frequency""" 80 | af_t = genotype_t.sum(1) / (alleles * genotype_t.shape[1]) 81 | return torch.where(af_t > 0.5, 1 - af_t, af_t) 82 | 83 | 84 | def get_allele_stats(genotype_t): 85 | """Returns allele frequency, minor allele samples, and minor allele counts (row-wise).""" 86 | # allele frequency 87 | n2 = 2 * genotype_t.shape[1] 88 | af_t = genotype_t.sum(1) / n2 89 | # minor allele samples and counts 90 | ix_t = af_t <= 0.5 91 | m = genotype_t > 0.5 92 | a = m.sum(1).int() 93 | b = (genotype_t < 1.5).sum(1).int() 94 | ma_samples_t = torch.where(ix_t, a, b) 95 | a = (genotype_t * m.float()).sum(1).int() 96 | # a = (genotype_t * m.float()).sum(1).round().int() # round for missing/imputed genotypes 97 | ma_count_t = torch.where(ix_t, a, n2-a) 98 | return af_t, ma_samples_t, ma_count_t 99 | 100 | 101 | def filter_maf(genotypes_t, variant_ids, maf_threshold, alleles=2): 102 | """Calculate MAF and filter genotypes that don't pass threshold""" 103 | af_t = genotypes_t.sum(1) / (alleles * genotypes_t.shape[1]) 104 | maf_t = torch.where(af_t > 0.5, 1 - af_t, af_t) 105 | if maf_threshold > 0: 106 | mask_t = maf_t >= maf_threshold 107 | genotypes_t = genotypes_t[mask_t] 108 | variant_ids = variant_ids[mask_t.cpu().numpy().astype(bool)] 109 | af_t = af_t[mask_t] 110 | return genotypes_t, variant_ids, af_t 111 | 112 | 113 | def filter_maf_interaction(genotypes_t, interaction_mask_t=None, maf_threshold_interaction=0.05): 114 | # filter monomorphic sites (to avoid colinearity) 115 | mask_t = ~((genotypes_t==0).all(1) | (genotypes_t==1).all(1) | (genotypes_t==2).all(1)) 116 | if interaction_mask_t is not None: 117 | upper_t = calculate_maf(genotypes_t[:, interaction_mask_t]) >= 
maf_threshold_interaction - 1e-7 118 | lower_t = calculate_maf(genotypes_t[:,~interaction_mask_t]) >= maf_threshold_interaction - 1e-7 119 | mask_t = mask_t & upper_t & lower_t 120 | genotypes_t = genotypes_t[mask_t] 121 | return genotypes_t, mask_t 122 | 123 | 124 | def impute_mean(genotypes_t, missing=-9): 125 | """Impute missing genotypes to mean""" 126 | m = genotypes_t == missing 127 | ix = torch.nonzero(m, as_tuple=True)[0] 128 | if len(ix) > 0: 129 | a = genotypes_t.sum(1) 130 | b = m.sum(1).float() 131 | mu = (a - missing*b) / (genotypes_t.shape[1] - b) 132 | genotypes_t[m] = mu[ix] 133 | 134 | 135 | def center_normalize(M_t, dim=0): 136 | """Center and normalize M""" 137 | N_t = M_t - M_t.mean(dim=dim, keepdim=True) 138 | return N_t / torch.sqrt(torch.pow(N_t, 2).sum(dim=dim, keepdim=True)) 139 | 140 | 141 | def calculate_corr(genotype_t, phenotype_t, residualizer=None, return_var=False): 142 | """Calculate correlation between normalized residual genotypes and phenotypes""" 143 | 144 | # residualize 145 | if residualizer is not None: 146 | genotype_res_t = residualizer.transform(genotype_t) # variants x samples 147 | phenotype_res_t = residualizer.transform(phenotype_t) # phenotypes x samples 148 | else: 149 | genotype_res_t = genotype_t 150 | phenotype_res_t = phenotype_t 151 | 152 | if return_var: 153 | genotype_var_t = genotype_res_t.var(1) 154 | phenotype_var_t = phenotype_res_t.var(1) 155 | 156 | # center and normalize 157 | genotype_res_t = center_normalize(genotype_res_t, dim=1) 158 | phenotype_res_t = center_normalize(phenotype_res_t, dim=1) 159 | 160 | # correlation 161 | if return_var: 162 | return torch.mm(genotype_res_t, phenotype_res_t.t()), genotype_var_t, phenotype_var_t 163 | else: 164 | return torch.mm(genotype_res_t, phenotype_res_t.t()) 165 | 166 | 167 | def get_t_pval(t, df, log=False): 168 | """ 169 | Get p-value corresponding to t statistic and degrees of freedom (df). t and/or df can be arrays. 170 | If log=True, returns -log10(P). 171 | """ 172 | if not log: 173 | return 2 * stats.t.cdf(-abs(t), df) 174 | else: 175 | if has_rpy2: 176 | return -(rfunc.t_cdf(-abs(t), df, lower_tail=True, log=True) + np.log(2)) * np.log10(np.e) 177 | else: 178 | raise ValueError("R and rpy2 are required to compute -log10(P)") 179 | 180 | 181 | def calculate_interaction_nominal(genotypes_t, phenotypes_t, interaction_t, residualizer=None, 182 | return_sparse=False, tstat_threshold=None, variant_ids=None): 183 | """ 184 | Solve y ~ g + i + g:i, where i is an interaction vector or matrix 185 | 186 | Inputs 187 | genotypes_t: [num_genotypes x num_samples] 188 | phenotypes_t: [num_phenotypes x num_samples] 189 | interaction_t: [num_samples x num_interactions] 190 | 191 | Outputs 192 | if return_sparse is False (default): 193 | tstat_t, b_t, b_se_t, af_t, ma_samples_t, ma_count_t 194 | tstat_t, b_t, b_se_t columns: [g, i_1 ... i_n, gi_1, ... 
gi_n] 195 | where n is the number of interactions 196 | if return_sparse is True: 197 | tstat_g_t, tstat_i_t, tstat_gi_t, af_t, ix 198 | ix: indexes [genotype, phenotype] 199 | """ 200 | ng, ns = genotypes_t.shape 201 | nps = phenotypes_t.shape[0] 202 | ni = interaction_t.shape[1] 203 | 204 | # centered inputs 205 | g0_t = genotypes_t - genotypes_t.mean(1, keepdim=True) # genotypes x samples 206 | gi_t = (genotypes_t.unsqueeze(2) * interaction_t.unsqueeze(0)) # genotypes x samples x interactions 207 | gi0_t = gi_t - gi_t.mean(1, keepdim=True) # mean across samples 208 | i0_t = interaction_t - interaction_t.mean(0) # samples x interactions 209 | p0_t = phenotypes_t - phenotypes_t.mean(1, keepdim=True) # 1 x samples 210 | 211 | # residualize rows 212 | if residualizer is not None: 213 | p0_t = residualizer.transform(p0_t, center=False) 214 | g0_t = residualizer.transform(g0_t, center=False) 215 | i0_t = residualizer.transform(i0_t.t(), center=False).t() 216 | for k in range(i0_t.shape[1]): 217 | gi0_t[..., k] = residualizer.transform(gi0_t[..., k], center=False) 218 | i0_t = i0_t.repeat(ng, 1, 1) 219 | 220 | # regression (in float; loss of precision may occur in edge cases) 221 | X_t = torch.cat([g0_t.unsqueeze(-1), i0_t, gi0_t], 2) # ng x ns x (1+2*ni) 222 | try: 223 | Xinv = torch.matmul(torch.transpose(X_t, 1, 2), X_t).inverse() # ng x (1+2*ni) x (1+2*ni) 224 | except Exception as e: 225 | if variant_ids is not None and len(e.args) >= 1: 226 | i = int(re.findall('For batch (\d+)', str(e))[0]) 227 | e.args = (e.args[0] + f'\n Likely problematic variant: {variant_ids[i]} ',) + e.args[1:] 228 | raise 229 | 230 | p0_tile_t = p0_t.unsqueeze(0).expand([ng, *p0_t.shape]) # ng x np x ns 231 | 232 | # calculate b, b_se 233 | # [(ng x nb x nb) x (ng x nb x ns)] x (ng x ns x np) = (ng x nb x np) 234 | b_t = torch.matmul(torch.matmul(Xinv, torch.transpose(X_t, 1, 2)), torch.transpose(p0_tile_t, 1, 2)) 235 | nb = b_t.shape[1] 236 | # residualizer.dof already includes intercept, b_g, add b_i and b_gi for each interaction 237 | if residualizer is not None: 238 | dof = residualizer.dof - 2*ni 239 | else: 240 | dof = phenotypes_t.shape[1] - 2 - 2*ni 241 | if nps == 1: # single phenotype case 242 | r_t = torch.matmul(X_t, b_t).squeeze() - p0_t 243 | rss_t = (r_t*r_t).sum(1) 244 | b_se_t = torch.sqrt(Xinv[:, torch.eye(nb, dtype=torch.uint8).bool()] * rss_t.unsqueeze(1) / dof) 245 | b_t = b_t.squeeze(2) 246 | # r_t = tf.squeeze(tf.matmul(X_t, b_t)) - p0_t # (ng x ns x 3) x (ng x 3 x 1) 247 | # rss_t = tf.reduce_sum(tf.multiply(r_t, r_t), axis=1) 248 | # b_se_t = tf.sqrt( tf.matrix_diag_part(Xinv) * tf.expand_dims(rss_t, 1) / dof ) 249 | else: 250 | # b_t = tf.matmul(p0_tile_t, tf.matmul(Xinv, X_t, transpose_b=True), transpose_b=True) 251 | # convert to ng x np x 3?? 
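# Multi-phenotype case: X_t is (ng x ns x nb) and b_t is (ng x nb x np), so the
# batched matmul below gives residuals r_t of shape (ng x ns x np); the diagonal
# of Xinv scaled by rss_t/dof yields the squared standard error of each
# coefficient per genotype-phenotype pair (the repeat([1,3,1]) assumes nb = 3,
# i.e., a single interaction).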
252 | r_t = torch.matmul(X_t, b_t) - torch.transpose(p0_tile_t, 1, 2) # (ng x ns x np) 253 | rss_t = (r_t*r_t).sum(1) # ng x np 254 | b_se_t = torch.sqrt(Xinv[:, torch.eye(nb, dtype=torch.uint8).bool()].unsqueeze(-1).repeat([1,1,nps]) * rss_t.unsqueeze(1).repeat([1,3,1]) / dof) 255 | # b_se_t = tf.sqrt(tf.tile(tf.expand_dims(tf.matrix_diag_part(Xinv), 2), [1,1,nps]) * tf.tile(tf.expand_dims(rss_t, 1), [1,3,1]) / dof) # (ng x 3) -> (ng x 3 x np) 256 | 257 | tstat_t = (b_t.double() / b_se_t.double()).float() # (ng x nb x np) 258 | 259 | # tdist = tfp.distributions.StudentT(np.float64(dof), loc=np.float64(0.0), scale=np.float64(1.0)) 260 | if not return_sparse: 261 | # calculate pval 262 | # pval_t = tf.scalar_mul(2, tdist.cdf(-tf.abs(tstat_t))) # (ng x 3 x np) 263 | af_t, ma_samples_t, ma_count_t = get_allele_stats(genotypes_t) 264 | return tstat_t, b_t, b_se_t, af_t, ma_samples_t, ma_count_t 265 | 266 | else: # sparse output 267 | if ni > 1: 268 | raise NotImplementedError("Sparse mode not yet supported for >1 interactions") 269 | af_t = genotypes_t.sum(1) / (2*ns) 270 | tstat_g_t = tstat_t[:,0,:] # genotypes x phenotypes 271 | tstat_i_t = tstat_t[:,1,:] 272 | tstat_gi_t = tstat_t[:,2,:] 273 | m = tstat_gi_t.abs() >= tstat_threshold 274 | tstat_g_t = tstat_g_t[m] 275 | tstat_i_t = tstat_i_t[m] 276 | tstat_gi_t = tstat_gi_t[m] 277 | ix = m.nonzero(as_tuple=False) # indexes: [genotype, phenotype] 278 | return tstat_g_t, tstat_i_t, tstat_gi_t, af_t[ix[:,0]], ix 279 | 280 | 281 | def linreg(X_t, y_t, dtype=torch.float64): 282 | """ 283 | Robust linear regression. Solves y = Xb, standardizing X. 284 | The first column of X must be the intercept. 285 | """ 286 | x_std_t = X_t.std(0) 287 | x_mean_t = X_t.mean(0) 288 | x_std_t[0] = 1 289 | x_mean_t[0] = 0 290 | 291 | # standardize X 292 | Xtilde_t = (X_t - x_mean_t) / x_std_t 293 | 294 | # regression 295 | XtX_t = torch.matmul(Xtilde_t.T, Xtilde_t) 296 | Xty_t = torch.matmul(Xtilde_t.T, y_t) 297 | b_t = torch.linalg.solve(XtX_t, Xty_t.unsqueeze(-1)) 298 | b_t = b_t.squeeze() 299 | 300 | # compute s.e. 301 | dof = X_t.shape[0] - X_t.shape[1] 302 | r_t = y_t - torch.matmul(Xtilde_t, b_t) 303 | sigma2_t = (r_t*r_t).sum() / dof 304 | XtX_inv_t = torch.linalg.solve(XtX_t, torch.eye(X_t.shape[1], dtype=dtype).to(X_t.device)) 305 | var_b_t = sigma2_t * XtX_inv_t 306 | b_se_t = torch.sqrt(torch.diag(var_b_t)) 307 | 308 | # rescale 309 | b_t /= x_std_t 310 | b_se_t /= x_std_t 311 | 312 | # adjust intercept 313 | b_t[0] -= torch.sum(x_mean_t * b_t) 314 | ms_t = x_mean_t / x_std_t 315 | b_se_t[0] = torch.sqrt(b_se_t[0]**2 + torch.matmul(torch.matmul(ms_t.T, var_b_t), ms_t)) 316 | 317 | return b_t, b_se_t 318 | 319 | 320 | def filter_covariates(covariates_t, log_counts_t, tstat_threshold=2): 321 | """ 322 | Inputs: 323 | covariates0_t: covariates matrix (samples x covariates) 324 | including genotype PCs, PEER factors, etc. 
325 | ** with intercept in first column **
326 | log_counts_t: counts vector (samples)
327 | """
328 | assert (covariates_t[:,0] == 1).all()
329 | b_t, b_se_t = linreg(covariates_t, log_counts_t)
330 | tstat_t = b_t / b_se_t
331 | m = tstat_t.abs() > tstat_threshold
332 | m[0] = True # keep intercept
333 | return covariates_t[:, m]
334 | 
335 | 
336 | #------------------------------------------------------------------------------
337 | # Functions for beta-approximating empirical p-values
338 | #------------------------------------------------------------------------------
339 | def pval_from_corr(r2, dof, logp=False):
340 | tstat2 = dof * r2 / (1 - r2)
341 | return get_t_pval(np.sqrt(tstat2), dof, log=logp)
342 | 
343 | 
344 | def beta_shape_1_from_dof(r2, dof):
345 | """compute the Beta shape 1 parameter from moment matching"""
346 | pval = pval_from_corr(r2, dof)
347 | mean = np.mean(pval)
348 | var = np.var(pval)
349 | return mean * (mean * (1.0-mean) / var - 1.0)
350 | 
351 | 
352 | def beta_log_likelihood(x, shape1, shape2):
353 | """negative log-likelihood of beta distribution"""
354 | logbeta = loggamma(shape1) + loggamma(shape2) - loggamma(shape1+shape2)
355 | return (1.0-shape1)*np.sum(np.log(x)) + (1.0-shape2)*np.sum(np.log(1.0-x)) + len(x)*logbeta
356 | 
357 | 
358 | def fit_beta_parameters(r2_perm, dof_init, tol=1e-4, return_minp=False):
359 | """
360 | r2_perm: array of max. r2 values from permutations
361 | dof_init: degrees of freedom
362 | """
363 | try:
364 | # Find the degrees of freedom such that the first beta parameter is
365 | # close to 1, by finding the root where the log of the beta parameter
366 | # as a function of r2_perm and dof is 0. Optimizing log(beta shape 1)
367 | # with a parameterization of log(dof) makes this close to a linear
368 | # function.
369 | log_true_dof = scipy.optimize.newton(lambda x: np.log(beta_shape_1_from_dof(r2_perm, np.exp(x))),
370 | np.log(dof_init), tol=tol, maxiter=50)
371 | true_dof = np.exp(log_true_dof)
372 | except:
373 | # fall back to minimization
374 | print('WARNING: scipy.optimize.newton failed to converge (running scipy.optimize.minimize)')
375 | res = scipy.optimize.minimize(lambda x: np.abs(beta_shape_1_from_dof(r2_perm, x) - 1),
376 | dof_init, method='Nelder-Mead', tol=tol)
377 | true_dof = res.x[0]
378 | 
379 | pval = pval_from_corr(r2_perm, true_dof)
380 | mean, var = np.mean(pval), np.var(pval)
381 | beta_shape1 = mean * (mean * (1 - mean) / var - 1)
382 | beta_shape2 = beta_shape1 * (1/mean - 1)
383 | res = scipy.optimize.minimize(lambda s: beta_log_likelihood(pval, s[0], s[1]), [beta_shape1, beta_shape2], method='Nelder-Mead', tol=tol)
384 | beta_shape1, beta_shape2 = res.x
385 | if return_minp:
386 | return beta_shape1, beta_shape2, true_dof, pval
387 | else:
388 | return beta_shape1, beta_shape2, true_dof
389 | 
390 | 
391 | def calculate_beta_approx_pval(r2_perm, r2_nominal, dof_init, tol=1e-4):
392 | """
393 | r2_nominal: nominal max. r2 (scalar or array)
394 | r2_perm: array of max. 
r2 values from permutations 395 | dof_init: degrees of freedom 396 | """ 397 | beta_shape1, beta_shape2, true_dof = fit_beta_parameters(r2_perm, dof_init, tol) 398 | pval_true_dof = pval_from_corr(r2_nominal, true_dof) 399 | pval_beta = stats.beta.cdf(pval_true_dof, beta_shape1, beta_shape2) 400 | return pval_beta, beta_shape1, beta_shape2, true_dof, pval_true_dof 401 | 402 | #------------------------------------------------------------------------------ 403 | # i/o functions 404 | #------------------------------------------------------------------------------ 405 | 406 | def read_phenotype_bed(phenotype_bed): 407 | """Load phenotype BED file as phenotype and position DataFrames""" 408 | if phenotype_bed.lower().endswith(('.bed.gz', '.bed')): 409 | phenotype_df = pd.read_csv(phenotype_bed, sep='\t', index_col=3, dtype={'#chr':str, '#Chr':str}) 410 | elif phenotype_bed.lower().endswith('.bed.parquet'): 411 | phenotype_df = pd.read_parquet(phenotype_bed) 412 | phenotype_df.set_index(phenotype_df.columns[3], inplace=True) 413 | else: 414 | raise ValueError('Unsupported file type.') 415 | phenotype_df.rename(columns={i:i.lower().replace('#chr','chr') for i in phenotype_df.columns[:3]}, inplace=True) 416 | 417 | phenotype_df['start'] += 1 # change to 1-based 418 | pos_df = phenotype_df[['chr', 'start', 'end']] 419 | phenotype_df.drop(['chr', 'start', 'end'], axis=1, inplace=True) 420 | 421 | # make sure BED file is properly sorted 422 | assert pos_df.equals( 423 | pos_df.groupby('chr', sort=False, group_keys=False).apply(lambda x: x.sort_values(['start', 'end'])) 424 | ), "Positions in BED file must be sorted." 425 | 426 | if (pos_df['start'] == pos_df['end']).all(): 427 | pos_df = pos_df[['chr', 'end']].rename(columns={'end':'pos'}) 428 | 429 | return phenotype_df, pos_df 430 | -------------------------------------------------------------------------------- /tensorqtl/eigenmt.py: -------------------------------------------------------------------------------- 1 | """eigenmt.py: Re-implementation of eigenMT (Davis et al., AJHG, 2016)""" 2 | 3 | __author__ = "Francois Aguet" 4 | __copyright__ = "Copyright 2019, The Broad Institute" 5 | __license__ = "BSD3" 6 | 7 | import torch 8 | import numpy as np 9 | import pandas as pd 10 | import time 11 | import os 12 | import sys 13 | from collections import OrderedDict 14 | 15 | sys.path.insert(1, os.path.dirname(__file__)) 16 | import genotypeio 17 | from core import * 18 | 19 | 20 | def lw_shrink(X_t): 21 | """ 22 | Estimates the shrunk Ledoit-Wolf covariance matrix 23 | 24 | Args: 25 | X_t: samples x variants 26 | 27 | Returns: 28 | shrunk_cov_t: shrunk covariance 29 | shrinkage_t: shrinkage coefficient 30 | 31 | Adapted from scikit-learn: 32 | https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/covariance/shrunk_covariance_.py 33 | """ 34 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 35 | 36 | if len(X_t.shape) == 2: 37 | n_samples, n_features = X_t.shape # samples x variants 38 | X_t = X_t - X_t.mean(0) 39 | X2_t = X_t.pow(2) 40 | emp_cov_trace_sum = X2_t.sum() / n_samples 41 | delta_ = torch.mm(X_t.t(), X_t).pow(2).sum() / n_samples**2 42 | beta_ = torch.mm(X2_t.t(), X2_t).sum() 43 | beta = 1. / (n_features * n_samples) * (beta_ / n_samples - delta_) 44 | delta = delta_ - 1. 
* emp_cov_trace_sum**2 / n_features 45 | delta /= n_features 46 | beta = torch.min(beta, delta) 47 | shrinkage_t = 0 if beta == 0 else beta / delta 48 | emp_cov_t = torch.mm(X_t.t(), X_t) / n_samples 49 | mu_t = torch.trace(emp_cov_t) / n_features 50 | shrunk_cov_t = (1. - shrinkage_t) * emp_cov_t 51 | shrunk_cov_t.view(-1)[::n_features + 1] += shrinkage_t * mu_t # add to diagonal 52 | else: # broadcast along first dimension 53 | n_samples, n_features = X_t.shape[1:] # samples x variants 54 | X_t = X_t - X_t.mean(1, keepdim=True) 55 | X2_t = X_t.pow(2) 56 | emp_cov_trace_sum = X2_t.sum([1,2]) / n_samples 57 | delta_ = torch.matmul(torch.transpose(X_t, 1, 2), X_t).pow(2).sum([1,2]) / n_samples**2 58 | beta_ = torch.matmul(torch.transpose(X2_t, 1, 2), X2_t).sum([1,2]) 59 | beta = 1. / (n_features * n_samples) * (beta_ / n_samples - delta_) 60 | delta = delta_ - 1. * emp_cov_trace_sum**2 / n_features 61 | delta /= n_features 62 | beta = torch.min(beta, delta) 63 | shrinkage_t = torch.where(beta==0, torch.zeros(beta.shape).to(device), beta/delta) 64 | emp_cov_t = torch.matmul(torch.transpose(X_t, 1, 2), X_t) / n_samples 65 | mu_t = torch.diagonal(emp_cov_t, dim1=1, dim2=2).sum(1) / n_features 66 | shrunk_cov_t = (1 - shrinkage_t.reshape([shrinkage_t.shape[0], 1, 1])) * emp_cov_t 67 | 68 | ix = torch.LongTensor(np.array([np.arange(0, n_features**2, n_features+1)+i*n_features**2 for i in range(X_t.shape[0])])).to(device) 69 | shrunk_cov_t.view(-1)[ix] += (shrinkage_t * mu_t).unsqueeze(-1) # add to diagonal 70 | 71 | return shrunk_cov_t, shrinkage_t 72 | 73 | 74 | def find_num_eigs(eigenvalues, variance, var_thresh=0.99): 75 | """Returns the number of eigenvalues required to reach threshold of variance explained.""" 76 | eigenvalues = np.sort(eigenvalues)[::-1] 77 | running_sum = 0 78 | counter = 0 79 | while running_sum < variance * var_thresh: 80 | running_sum += eigenvalues[counter] 81 | counter += 1 82 | return counter 83 | 84 | 85 | def compute_tests(genotypes_t, var_thresh=0.99, variant_window=200): 86 | """determine effective number of independent variants (M_eff)""" 87 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 88 | 89 | # break into windows 90 | windows = torch.split(genotypes_t, variant_window) 91 | 92 | if len(windows)>1: 93 | shrunk_cov_t, shrinkage_t = lw_shrink(torch.transpose(torch.stack(windows[:-1]), 1, 2)) 94 | 95 | n_samples, n_features = windows[0].T.shape 96 | # indices of diagonals 97 | ix = torch.LongTensor(np.array([np.arange(0, n_features**2, n_features+1)+i*n_features**2 for i in range(shrunk_cov_t.shape[0])])).to(device) 98 | shrunk_precision_t = torch.zeros(shrunk_cov_t.shape).to(device) 99 | shrunk_precision_t.view(-1)[ix] = shrunk_cov_t.view(-1)[ix].pow(-0.5) 100 | shrunk_cor_t = torch.matmul(torch.matmul(shrunk_precision_t, shrunk_cov_t), shrunk_precision_t) 101 | # eigenvalues_t,_ = torch.symeig(shrunk_cor_t, eigenvectors=False) # will be deprecated 102 | eigenvalues_t = torch.linalg.eigvalsh(shrunk_cor_t) # ~2x slower than symeig with 1.10.0+cu102 and 2.0.1+cu118 103 | 104 | # last window 105 | shrunk_cov0_t, shrinkage0_t = lw_shrink(windows[-1].t()) 106 | shrunk_precision0_t = torch.diag(torch.diag(shrunk_cov0_t).pow(-0.5)) 107 | shrunk_cor0_t = torch.mm(torch.mm(shrunk_precision0_t, shrunk_cov0_t), shrunk_precision0_t) 108 | # eigenvalues0_t,_ = torch.symeig(shrunk_cor0_t, eigenvectors=False) 109 | eigenvalues0_t = torch.linalg.eigvalsh(shrunk_cor0_t) 110 | 111 | if len(windows) > 1: 112 | eigenvalues = 
list(eigenvalues_t.cpu().numpy())
113 | eigenvalues.append(eigenvalues0_t.cpu().numpy())
114 | else:
115 | eigenvalues = [eigenvalues0_t.cpu().numpy()]
116 | 
117 | m_eff = 0
118 | for ev,m in zip(eigenvalues, [i.shape[0] for i in windows]):
119 | ev[ev < 0] = 0
120 | m_eff += find_num_eigs(ev, m, var_thresh=var_thresh)
121 | 
122 | return m_eff
123 | 
124 | 
125 | 
126 | def run_eigenmt(genotype_df, variant_df, phenotype_df, phenotype_pos_df, interaction_s=None,
127 | maf_threshold=0, var_thresh=0.99, variant_window=200, window=1000000, verbose=True, logger=None):
128 | """Standalone function for computing eigenMT correction.
129 | 
130 | Returns the number of tests for each gene
131 | """
132 | 
133 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
134 | 
135 | if logger is None:
136 | logger = SimpleLogger()
137 | 
138 | logger.write('eigenMT: estimating number of independent variants tested for each phenotype')
139 | 
140 | 
141 | logger.write(f' * {phenotype_df.shape[1]} samples')
142 | logger.write(f' * {phenotype_df.shape[0]} phenotypes')
143 | logger.write(f' * {genotype_df.shape[0]} variants')
144 | 
145 | if interaction_s is not None and maf_threshold > 0:
146 | interaction_mask_t = torch.BoolTensor(interaction_s >= interaction_s.median()).to(device)
147 | else:
148 | interaction_mask_t = None
149 | 
150 | genotype_ix = np.array([genotype_df.columns.tolist().index(i) for i in phenotype_df.columns])
151 | genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
152 | 
153 | igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, window=window)
154 | start_time = time.time()
155 | m_eff = OrderedDict()
156 | for k, (phenotype, genotypes, genotype_range, phenotype_id) in enumerate(igc.generate_data(verbose=verbose), 1):
157 | 
158 | # copy genotypes to GPU
159 | genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
160 | genotypes_t = genotypes_t[:,genotype_ix_t]
161 | impute_mean(genotypes_t)
162 | 
163 | if interaction_s is None:
164 | mask_t = calculate_maf(genotypes_t) >= maf_threshold
165 | genotypes_t = genotypes_t[mask_t]
166 | else:
167 | genotypes_t, mask_t = filter_maf_interaction(genotypes_t, interaction_mask_t=interaction_mask_t, maf_threshold_interaction=maf_threshold)
168 | 
169 | m_eff[phenotype_id] = compute_tests(genotypes_t, var_thresh=var_thresh, variant_window=variant_window)
170 | 
171 | logger.write(f' time elapsed: {(time.time()-start_time)/60:.2f} min')
172 | return pd.Series(m_eff)
173 | 
174 | 
175 | def padjust_bh(p):
176 | """Benjamini-Hochberg adjusted p-values"""
177 | if not np.all(np.isfinite(p)):
178 | raise ValueError('P values must be finite.')
179 | n = len(p)
180 | i = np.arange(n,0,-1)
181 | o = np.argsort(p)[::-1]
182 | ro = np.argsort(o)
183 | return np.minimum(1, np.minimum.accumulate(np.float64(n)/i * np.array(p)[o]))[ro]
184 | --------------------------------------------------------------------------------
/tensorqtl/genotypeio.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import tempfile
3 | import numpy as np
4 | import subprocess
5 | import os
6 | import gzip
7 | import sys
8 | import threading
9 | import queue
10 | import bisect
11 | from pandas_plink import read_plink
12 | 
13 | sys.path.insert(1, os.path.dirname(__file__))
14 | from core import *
15 | 
16 | try:
17 | import pgen
18 | except ImportError as e:
19 | pgen = None
20 | 
21 | 
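# Example usage of this module (a sketch; the file prefixes/paths below are hypothetical):
#
#   import genotypeio
#   from core import read_phenotype_bed
#
#   phenotype_df, phenotype_pos_df = read_phenotype_bed('phenotypes.bed.gz')
#   # load genotypes and variant positions from PLINK pgen/psam/pvar or bed/bim/fam files
#   genotype_df, variant_df = genotypeio.load_genotypes('genotypes')
#   # iterate over phenotypes together with the genotypes in each cis-window
#   igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df,
#                                      phenotype_pos_df, window=1000000)
#   for phenotype, genotypes, genotype_range, phenotype_id in igc.generate_data():
#       pass  # compute associations for this phenotype
22 | 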
gt_to_dosage_dict = {'0/0':0, '0/1':1, '1/1':2, './.':np.nan, 23 | '0|0':0, '0|1':1, '1|0':1, '1|1':2, '.|.':np.nan} 24 | 25 | 26 | def _check_dependency(name): 27 | e = subprocess.call(f"which {name}", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) 28 | if e != 0: 29 | raise RuntimeError(f"External dependency '{name}' not installed") 30 | 31 | 32 | def print_progress(k, n, entity): 33 | s = f'\r processing {entity} {k}/{n}' 34 | if k == n: 35 | s += '\n' 36 | sys.stdout.write(s) 37 | sys.stdout.flush() 38 | 39 | 40 | class BackgroundGenerator(threading.Thread): 41 | # Adapted from https://github.com/justheuristic/prefetch_generator 42 | def __init__(self, generator, max_prefetch=10): 43 | threading.Thread.__init__(self) 44 | self.queue = queue.Queue(max_prefetch) 45 | self.generator = generator 46 | self.daemon = True 47 | self.start() 48 | 49 | def run(self): 50 | try: 51 | for item in self.generator: 52 | self.queue.put(item) 53 | except Exception as exception: 54 | self.queue.put(exception) 55 | self.queue.put(None) 56 | 57 | def next(self): 58 | next_item = self.queue.get() 59 | if next_item is None: 60 | self.join() 61 | raise StopIteration 62 | if isinstance(next_item, Exception): 63 | self.join() 64 | raise next_item 65 | return next_item 66 | 67 | def __next__(self): 68 | return self.next() 69 | 70 | def __iter__(self): 71 | return self 72 | 73 | class background: 74 | def __init__(self, max_prefetch=10): 75 | self.max_prefetch = max_prefetch 76 | def __call__(self,gen): 77 | def bg_generator(*args,**kwargs): 78 | return BackgroundGenerator(gen(*args,**kwargs), max_prefetch=self.max_prefetch) 79 | return bg_generator 80 | 81 | 82 | #------------------------------------------------------------------------------ 83 | # Functions for writing VCFs 84 | #------------------------------------------------------------------------------ 85 | def _get_vcf_opener(vcfpath): 86 | if vcfpath.endswith('.vcf.gz'): 87 | return gzip.open(vcfpath, 'rt') 88 | else: 89 | return open(vcfpath) 90 | 91 | 92 | def get_sample_ids(vcfpath): 93 | """Get sample IDs from VCF""" 94 | with _get_vcf_opener(vcfpath) as vcf: 95 | for header in vcf: 96 | if header[:2] == '##': continue 97 | break 98 | return header.strip().split('\t')[9:] 99 | 100 | 101 | def parse_genotypes(x, field='GT'): 102 | """Convert list of genotypes (str) to np.float32""" 103 | if field == 'GT': 104 | g = np.float32([gt_to_dosage_dict[i] for i in x]) 105 | elif field == 'DS': 106 | g = np.float32(x) 107 | return g 108 | 109 | 110 | def _get_field_ix(line, field): 111 | """Get position of field ('GT' or 'DS') in FORMAT""" 112 | fmt = line[8].split(':') 113 | if field not in fmt: 114 | raise ValueError(f'FORMAT field does not contain {field}') 115 | return fmt.index(field) 116 | 117 | #------------------------------------------------------------------------------ 118 | # Functions for loading regions/variants from VCFs 119 | #------------------------------------------------------------------------------ 120 | def _impute_mean(g, missing=-9, verbose=False): 121 | """Impute rows to mean (in place)""" 122 | if not g.dtype in [np.float32, np.float64]: 123 | raise ValueError('Input dtype must be np.float32 or np.float64') 124 | n = 0 125 | for i in np.where((g == missing).any(1))[0]: 126 | ix = g[i] == missing 127 | g[i][ix] = np.mean(g[i][~ix]) 128 | n += 1 129 | if verbose and n > 0: 130 | print(f' imputed at least 1 sample in {n}/{g.shape[0]} sites') 131 | 132 | 133 | class PlinkReader(object): 134 | def __init__(self, 
plink_prefix_path, select_samples=None, include_variants=None, 135 | exclude_variants=None, exclude_chrs=None, verbose=True, dtype=np.int8): 136 | """ 137 | Class for reading genotypes from PLINK bed files 138 | 139 | plink_prefix_path: prefix to PLINK bed,bim,fam files 140 | select_samples: specify a subset of samples 141 | 142 | Notes: 143 | Use this command to convert a VCF to PLINK format: 144 | plink2 --make-bed \ 145 | --output-chr chrM \ 146 | --vcf ${plink_prefix_path}.vcf.gz \ 147 | --out ${plink_prefix_path} 148 | 149 | If using plink v1, the --keep-allele-order flag must be included. 150 | 151 | Uses read_plink from pandas_plink. 152 | """ 153 | 154 | self.bim, self.fam, self.bed = read_plink(plink_prefix_path, verbose=verbose) 155 | self.bed = 2 - self.bed # flip allele order: PLINK uses REF as effect allele 156 | if dtype == np.int8: 157 | self.bed[np.isnan(self.bed)] = -9 # convert missing (NaN) to -9 for int8 158 | self.bed = self.bed.astype(dtype, copy=False) 159 | self.sample_ids = self.fam['iid'].tolist() 160 | if select_samples is not None: 161 | ix = [self.sample_ids.index(i) for i in select_samples] 162 | self.fam = self.fam.loc[ix] 163 | self.bed = self.bed[:,ix] 164 | self.sample_ids = self.fam['iid'].tolist() 165 | if include_variants is not None: 166 | m = self.bim['snp'].isin(include_variants).values 167 | self.bed = self.bed[m,:] 168 | self.bim = self.bim[m] 169 | self.bim.reset_index(drop=True, inplace=True) 170 | self.bim['i'] = self.bim.index 171 | if exclude_variants is not None: 172 | m = ~self.bim['snp'].isin(exclude_variants).values 173 | self.bed = self.bed[m,:] 174 | self.bim = self.bim[m] 175 | self.bim.reset_index(drop=True, inplace=True) 176 | self.bim['i'] = self.bim.index 177 | if exclude_chrs is not None: 178 | m = ~self.bim['chrom'].isin(exclude_chrs).values 179 | self.bed = self.bed[m,:] 180 | self.bim = self.bim[m] 181 | self.bim.reset_index(drop=True, inplace=True) 182 | self.bim['i'] = self.bim.index 183 | self.n_samples = self.fam.shape[0] 184 | self.chrs = list(self.bim['chrom'].unique()) 185 | self.variant_pos = {i:g['pos'] for i,g in self.bim.set_index('snp')[['chrom', 'pos']].groupby('chrom')} 186 | self.variant_pos_dict = self.bim.set_index('snp')['pos'].to_dict() 187 | 188 | def get_region_index(self, region_str, return_pos=False): 189 | s = region_str.split(':') 190 | chrom = s[0] 191 | c = self.bim[self.bim['chrom'] == chrom] 192 | if len(s) > 1: 193 | start, end = s[1].split('-') 194 | start = int(start) 195 | end = int(end) 196 | c = c[(c['pos'] >= start) & (c['pos'] <= end)] 197 | if return_pos: 198 | return c['i'].values, c.set_index('snp')['pos'] 199 | else: 200 | return c['i'].values 201 | 202 | def get_region(self, region_str, sample_ids=None, impute=False, verbose=False, dtype=np.int8): 203 | """Get genotypes for a region defined by 'chr:start-end' or 'chr'""" 204 | ix, pos_s = self.get_region_index(region_str, return_pos=True) 205 | g = self.bed[ix, :].compute().astype(dtype) 206 | if sample_ids is not None: 207 | ix = [self.sample_ids.index(i) for i in sample_ids] 208 | g = g[:, ix] 209 | if impute: 210 | _impute_mean(g, verbose=verbose) 211 | return g, pos_s 212 | 213 | def get_genotypes(self, variant_ids, sample_ids=None, impute=False, verbose=False, dtype=np.int8): 214 | """Load genotypes for selected variant IDs""" 215 | c = self.bim[self.bim['snp'].isin(variant_ids)] 216 | g = self.bed[c.i.values, :].compute().astype(dtype) 217 | if sample_ids is not None: 218 | ix = [self.sample_ids.index(i) for i in sample_ids] 219 | 
g = g[:, ix] 220 | if impute: 221 | _impute_mean(g, verbose=verbose) 222 | return g, c.set_index('snp')['pos'] 223 | 224 | def get_genotype(self, variant_id, sample_ids=None, impute=False, verbose=False, dtype=np.int8): 225 | """Load genotypes for a single variant ID as pd.Series""" 226 | g,_ = self.get_genotypes([variant_id], sample_ids=sample_ids, impute=impute, verbose=verbose, dtype=dtype) 227 | if sample_ids is None: 228 | return pd.Series(g[0], index=self.fam['iid'], name=variant_id) 229 | else: 230 | return pd.Series(g[0], index=sample_ids, name=variant_id) 231 | 232 | def load_genotypes(self): 233 | """Load all genotypes into memory, as pd.DataFrame""" 234 | return pd.DataFrame(self.bed.compute(), index=self.bim['snp'], columns=self.fam['iid']) 235 | 236 | 237 | def load_genotypes(genotype_path, select_samples=None, dosages=False): 238 | """Load all genotypes into a dataframe""" 239 | if all([os.path.exists(f"{genotype_path}.{ext}") for ext in ['pgen', 'psam', 'pvar']]): 240 | if pgen is None: 241 | raise ImportError('Pgenlib must be installed to use PLINK 2 pgen/psam/pvar files.') 242 | pgr = pgen.PgenReader(genotype_path, select_samples=select_samples) 243 | variant_df = pgr.pvar_df.set_index('id')[['chrom', 'pos']] 244 | if dosages: 245 | genotype_df = pgr.load_dosages() 246 | else: 247 | genotype_df = pgr.load_genotypes() 248 | elif all([os.path.exists(f"{genotype_path}.{ext}") for ext in ['bed', 'bim', 'fam']]): 249 | pr = PlinkReader(genotype_path, select_samples=select_samples, dtype=np.int8) 250 | genotype_df = pr.load_genotypes() 251 | variant_df = pr.bim.set_index('snp')[['chrom', 'pos']] 252 | elif genotype_path.endswith(('.bed.parquet', '.bed.gz', '.bed')): 253 | genotype_df, variant_df = read_phenotype_bed(genotype_path) 254 | assert variant_df.columns[1] == 'pos', "The BED file must define a single position for each variant, with start + 1 == end." 255 | variant_df.columns = ['chrom', 'pos'] 256 | elif genotype_path.endswith('.parquet'): 257 | genotype_df = pd.read_parquet(genotype_path) 258 | variant_df = None 259 | elif genotype_path.endswith('.gz'): 260 | with gzip.open(genotype_path, 'rt') as f: 261 | header = f.readline().strip().split('\t') 262 | dtypes = {i:np.float32 for i in header} 263 | dtypes[header[0]] = str 264 | genotype_df = pd.read_csv(genotype_path, sep='\t', index_col=0, dtype=dtypes) 265 | variant_df = None 266 | else: 267 | raise ValueError(f"Failed to load genotypes from {genotype_path}. 
Supported formats: pgen/psam/pvar, bed/bim/fam, parquet, tsv.gz") 268 | return genotype_df, variant_df 269 | 270 | 271 | def get_vcf_region(region_str, vcfpath, field='GT', sample_ids=None, select_samples=None, impute_missing=True): 272 | """Load VCF region (str: 'chr:start-end') as DataFrame (requires tabix)""" 273 | s = subprocess.check_output(f'tabix {vcfpath} {region_str}', shell=True) 274 | s = s.decode().strip().split('\n') 275 | s = [i.split('\t') for i in s] 276 | 277 | if sample_ids is None: 278 | sample_ids = get_sample_ids(vcfpath) 279 | variant_ids = [i[2] for i in s] 280 | pos_s = pd.Series([int(i[1]) for i in s], index=variant_ids) 281 | 282 | ix = _get_field_ix(s[0], field) 283 | g = np.array([parse_genotypes([i.split(':')[ix] for i in line[9:]], field=field) for line in s]) 284 | df = pd.DataFrame(g, index=variant_ids, columns=sample_ids) 285 | 286 | if select_samples is not None: 287 | df = df[select_samples] 288 | 289 | if impute_missing: 290 | n = 0 291 | for v in df.values: 292 | m = np.isnan(v) 293 | if np.any(m): 294 | v[m] = np.mean(v[~m]) 295 | n += 1 296 | if n > 0: 297 | print(f' imputed at least 1 sample in {n} sites') 298 | 299 | return df, pos_s 300 | 301 | 302 | def get_vcf_variants(variant_ids, vcfpath, field='GT', sample_ids=None): 303 | """Load a set of variants in VCF as DataFrame (requires tabix)""" 304 | variant_id_set = set(variant_ids) 305 | with tempfile.NamedTemporaryFile() as regions_file: 306 | df = pd.DataFrame([i.split('_')[:2] for i in variant_id_set], columns=['chr', 'pos']) 307 | df['pos'] = df['pos'].astype(int) 308 | df = df.sort_values(['chr', 'pos']) 309 | df.to_csv(regions_file.name, sep='\t', index=False, header=False) 310 | s = subprocess.check_output(f'tabix {vcfpath} --regions {regions_file.name}', shell=True) 311 | s = s.decode().strip().split('\n') 312 | s = [i.split('\t') for i in s] 313 | 314 | if sample_ids is None: 315 | sample_ids = get_sample_ids(vcfpath) 316 | 317 | ix = _get_field_ix(s[0], field) 318 | g = np.array([parse_genotypes([i.split(':')[ix] for i in line[9:]], field=field) for line in s]) 319 | g = np.array([i for i in g if -1 not in i]) # filter missing here instead of ValueError? 
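# tabix can return extra records that overlap the queried positions, so the
# block below subsets the results back to the requested variant IDs, keeping
# the order in which tabix returned them.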
320 | 321 | returned_variant_ids = [i[2] for i in s] 322 | ix = [k for k,i in enumerate(returned_variant_ids) if i in variant_id_set] 323 | g = np.array([g[i] for i in ix]) 324 | returned_variant_ids = [returned_variant_ids[i] for i in ix] 325 | return pd.DataFrame(g.astype(np.float32), index=returned_variant_ids, columns=sample_ids) 326 | 327 | #------------------------------------------------------------------------------ 328 | # Generator classes for batch processing of genotypes/phenotypes 329 | #------------------------------------------------------------------------------ 330 | class GenotypeGeneratorTrans(object): 331 | def __init__(self, genotype_df, batch_size=50000, chr_s=None): 332 | """ 333 | Generator for iterating over all variants (trans-scan) 334 | 335 | Inputs: 336 | genotype_df: Dataframe with genotypes (variants x samples) 337 | batch_size: Batch size for GPU processing 338 | 339 | Generates: genotype array (2D), variant ID array 340 | """ 341 | self.genotype_df = genotype_df 342 | self.batch_size = batch_size 343 | self.num_batches = int(np.ceil(self.genotype_df.shape[0] / batch_size)) 344 | self.batch_indexes = [[i*batch_size, (i+1)*batch_size] for i in range(self.num_batches)] 345 | self.batch_indexes[-1][1] = self.genotype_df.shape[0] 346 | if chr_s is not None: 347 | chroms, chr_ix = np.unique(chr_s, return_index=True) 348 | s = np.argsort(chr_ix) 349 | self.chroms = chroms[s] 350 | chr_ix = list(chr_ix[s]) + [chr_s.shape[0]] 351 | size_s = pd.Series(np.diff(chr_ix), index=self.chroms) 352 | self.chr_batch_indexes = {} 353 | for k,c in enumerate(self.chroms): 354 | num_batches = int(np.ceil(size_s[c] / batch_size)) 355 | batch_indexes = [[chr_ix[k]+i*batch_size, chr_ix[k]+(i+1)*batch_size] for i in range(num_batches)] 356 | batch_indexes[-1][1] = chr_ix[k+1] 357 | self.chr_batch_indexes[c] = batch_indexes 358 | 359 | def __len__(self): 360 | return self.num_batches 361 | 362 | @background(max_prefetch=6) 363 | def generate_data(self, chrom=None, verbose=False, enum_start=1): 364 | """Generate batches from genotype data""" 365 | if chrom is None: 366 | batch_indexes = self.batch_indexes 367 | num_batches = self.num_batches 368 | else: 369 | batch_indexes = self.chr_batch_indexes[chrom] 370 | num_batches = np.sum([len(i) for i in self.chr_batch_indexes.values()]) 371 | 372 | for k,i in enumerate(batch_indexes, enum_start): # loop through batches 373 | if verbose: 374 | print_progress(k, num_batches, 'batch') 375 | g = self.genotype_df.values[i[0]:i[1]] 376 | ix = self.genotype_df.index[i[0]:i[1]] # variant IDs 377 | yield g, ix 378 | 379 | 380 | def get_cis_ranges(phenotype_pos_df, chr_variant_dfs, window, verbose=True): 381 | """ 382 | 383 | start, end indexes (inclusive) 384 | """ 385 | # check phenotypes & calculate genotype ranges 386 | # get genotype indexes corresponding to cis-window of each phenotype 387 | if 'pos' in phenotype_pos_df: 388 | phenotype_pos_df = phenotype_pos_df.rename(columns={'pos':'start'}) 389 | phenotype_pos_df['end'] = phenotype_pos_df['start'] 390 | phenotype_pos_dict = phenotype_pos_df.to_dict(orient='index') 391 | 392 | drop_ids = [] 393 | cis_ranges = {} 394 | n = len(phenotype_pos_df) 395 | for k, phenotype_id in enumerate(phenotype_pos_df.index, 1): 396 | if verbose and (k % 1000 == 0 or k == n): 397 | print(f'\r * checking phenotypes: {k}/{n}', end='' if k != n else None) 398 | 399 | pos = phenotype_pos_dict[phenotype_id] 400 | chrom = pos['chr'] 401 | m = len(chr_variant_dfs[chrom]['pos'].values) 402 | lb = 
bisect.bisect_left(chr_variant_dfs[chrom]['pos'].values, pos['start'] - window) 403 | ub = bisect.bisect_right(chr_variant_dfs[chrom]['pos'].values, pos['end'] + window) 404 | if lb != ub: 405 | r = chr_variant_dfs[chrom]['index'].values[[lb, ub - 1]] 406 | else: 407 | r = [] 408 | 409 | if len(r) > 0: 410 | cis_ranges[phenotype_id] = r 411 | else: 412 | drop_ids.append(phenotype_id) 413 | 414 | return cis_ranges, drop_ids 415 | 416 | 417 | class InputGeneratorCis(object): 418 | """ 419 | Input generator for cis-mapping 420 | 421 | Inputs: 422 | genotype_df: genotype DataFrame (genotypes x samples) 423 | variant_df: DataFrame mapping variant_id (index) to chrom, pos 424 | phenotype_df: phenotype DataFrame (phenotypes x samples) 425 | phenotype_pos_df: DataFrame defining position of each phenotype, with columns ['chr', 'pos'] or ['chr', 'start', 'end'] 426 | window: cis-window; selects variants within +- cis-window from 'pos' (e.g., TSS for gene-based features) 427 | or within [start-window, end+window] if 'start' and 'end' are present in phenotype_pos_df 428 | 429 | Generates: phenotype array, genotype array (2D), cis-window indices, phenotype ID 430 | """ 431 | def __init__(self, genotype_df, variant_df, phenotype_df, phenotype_pos_df, group_s=None, window=1000000): 432 | assert (genotype_df.index == variant_df.index).all() 433 | assert (phenotype_df.index == phenotype_df.index.unique()).all() 434 | self.genotype_df = genotype_df 435 | self.variant_df = variant_df.copy() 436 | self.variant_df['index'] = np.arange(variant_df.shape[0]) 437 | self.n_samples = phenotype_df.shape[1] 438 | 439 | # drop phenotypes without genotypes on same contig 440 | variant_chrs = variant_df['chrom'].unique() 441 | phenotype_chrs = phenotype_pos_df['chr'].unique() 442 | self.chrs = [i for i in phenotype_chrs if i in variant_chrs] 443 | m = phenotype_pos_df['chr'].isin(self.chrs) 444 | if any(~m): 445 | print(f' ** dropping {sum(~m)} phenotypes on chrs. 
without genotypes') 446 | self.phenotype_df = phenotype_df[m] 447 | self.phenotype_pos_df = phenotype_pos_df[m] 448 | 449 | # check for constant phenotypes and drop 450 | m = np.all(self.phenotype_df.values == self.phenotype_df.values[:,[0]], 1) 451 | if m.any(): 452 | print(f' ** dropping {np.sum(m)} constant phenotypes') 453 | self.phenotype_df = self.phenotype_df.loc[~m] 454 | self.phenotype_pos_df = self.phenotype_pos_df.loc[~m] 455 | 456 | if len(self.phenotype_df) == 0: 457 | raise ValueError("No phenotypes remain after filters.") 458 | 459 | self.group_s = None 460 | self.window = window 461 | 462 | self.chr_variant_dfs = {c:g[['pos', 'index']] for c,g in self.variant_df.groupby('chrom')} 463 | 464 | # check phenotypes & calculate genotype ranges 465 | # get genotype indexes corresponding to cis-window of each phenotype 466 | self.cis_ranges, drop_ids = get_cis_ranges(self.phenotype_pos_df, self.chr_variant_dfs, self.window) 467 | if len(drop_ids) > 0: 468 | print(f" ** dropping {len(drop_ids)} phenotypes without variants in cis-window") 469 | self.phenotype_df = self.phenotype_df.drop(drop_ids) 470 | self.phenotype_pos_df = self.phenotype_pos_df.drop(drop_ids) 471 | if 'pos' in self.phenotype_pos_df: 472 | self.phenotype_start = self.phenotype_pos_df['pos'].to_dict() 473 | self.phenotype_end = self.phenotype_start 474 | else: 475 | self.phenotype_start = self.phenotype_pos_df['start'].to_dict() 476 | self.phenotype_end = self.phenotype_pos_df['end'].to_dict() 477 | self.n_phenotypes = self.phenotype_df.shape[0] 478 | 479 | if group_s is not None: 480 | self.group_s = group_s.loc[self.phenotype_df.index].copy() 481 | self.n_groups = self.group_s.unique().shape[0] 482 | 483 | 484 | @background(max_prefetch=6) 485 | def generate_data(self, chrom=None, verbose=False): 486 | """ 487 | Generate batches from genotype data 488 | 489 | Returns: phenotype array, genotype matrix, genotype index, phenotype ID(s), [group ID] 490 | """ 491 | if chrom is None: 492 | phenotype_ids = self.phenotype_df.index 493 | chr_offset = 0 494 | else: 495 | phenotype_ids = self.phenotype_pos_df[self.phenotype_pos_df['chr'] == chrom].index 496 | if self.group_s is None: 497 | offset_dict = {i:j for i,j in zip(*np.unique(self.phenotype_pos_df['chr'], return_index=True))} 498 | else: 499 | offset_dict = {i:j for i,j in zip(*np.unique(self.phenotype_pos_df['chr'][self.group_s.drop_duplicates().index], return_index=True))} 500 | chr_offset = offset_dict[chrom] 501 | 502 | index_dict = {j:i for i,j in enumerate(self.phenotype_df.index)} 503 | 504 | if self.group_s is None: 505 | for k,phenotype_id in enumerate(phenotype_ids, chr_offset+1): 506 | if verbose: 507 | print_progress(k, self.n_phenotypes, 'phenotype') 508 | p = self.phenotype_df.values[index_dict[phenotype_id]] 509 | # p = self.phenotype_df.values[k] 510 | r = self.cis_ranges[phenotype_id] 511 | yield p, self.genotype_df.values[r[0]:r[-1]+1], np.arange(r[0],r[-1]+1), phenotype_id 512 | else: 513 | gdf = self.group_s[phenotype_ids].groupby(self.group_s, sort=False) 514 | for k,(group_id,g) in enumerate(gdf, chr_offset+1): 515 | if verbose: 516 | print_progress(k, self.n_groups, 'phenotype group') 517 | # check that ranges are the same for all phenotypes within group 518 | assert np.all([self.cis_ranges[g.index[0]][0] == self.cis_ranges[i][0] and self.cis_ranges[g.index[0]][1] == self.cis_ranges[i][1] for i in g.index[1:]]) 519 | group_phenotype_ids = g.index.tolist() 520 | # p = self.phenotype_df.loc[group_phenotype_ids].values 521 | p = 
self.phenotype_df.values[[index_dict[i] for i in group_phenotype_ids]] 522 | r = self.cis_ranges[g.index[0]] 523 | yield p, self.genotype_df.values[r[0]:r[-1]+1], np.arange(r[0],r[-1]+1), group_phenotype_ids, group_id 524 | 525 | 526 | def get_chunk_size(memory_gb, samples): 527 | """""" 528 | return memory_gb * 1024**3 // samples 529 | 530 | 531 | def generate_paired_chunks(pgr, phenotype_df, phenotype_pos_df, chunk_size, window=1000000, 532 | dosages=False, verbose=True): 533 | """ 534 | Generate paired genotype-phenotype chunks for large datasets where only a subset of 535 | genotypes can be loaded into memory. 536 | 537 | pgr: pgen.PgenReader 538 | phenotype_df: phenotype DataFrame (phenotypes x samples) 539 | phenotype_pos_df: DataFrame defining position of each phenotype, with columns ['chr', 'pos'] or ['chr', 'start', 'end'] 540 | chunk_size: maximum number of variants to load into CPU memory 541 | window: cis-window 542 | dosages: load dosages (DS) from genotype files (default: GT) 543 | """ 544 | variant_df = pgr.pvar_df.set_index('id')[['chrom', 'pos']] 545 | cis_ranges, _ = get_cis_ranges(phenotype_pos_df, pgr.variant_dfs, window) 546 | range_df = pd.DataFrame(cis_ranges, index=['start', 'end']).T 547 | range_df = range_df.join(phenotype_pos_df['chr']) 548 | 549 | if chunk_size == 'chr': 550 | chrlen_s = range_df['chr'].value_counts(sort=False) 551 | start_ixs = [0] + chrlen_s.cumsum().tolist() 552 | else: 553 | chunk_size = int(chunk_size) 554 | # check chunk size 555 | max_cis_var = (range_df['end'] - range_df['start'] + 1).max() 556 | if not max_cis_var <= chunk_size: 557 | raise ValueError(f"Max. chunk size must be at least largest cis-window ({max_cis_var})") 558 | 559 | start_ixs = [0] 560 | while start_ixs[-1] < range_df.shape[0]: 561 | end_ix = bisect.bisect_left(range_df['end'].values, range_df['start'].values[start_ixs[-1]] + chunk_size) 562 | start_ixs.append(end_ix) 563 | start_ixs[-1] = range_df.shape[0] 564 | 565 | nchunks = len(start_ixs) - 1 566 | for ci in range(nchunks): 567 | if verbose: 568 | print(f"Processing genotype-phenotype chunk {ci+1}/{nchunks}") 569 | ix = slice(start_ixs[ci], start_ixs[ci+1]) 570 | chunk_df = range_df[ix] 571 | if chunk_size == 'chr': 572 | assert (chunk_df['chr'] == chrlen_s.index[ci]).all() 573 | if dosages: 574 | gt_df = pgr.read_dosages_range(chunk_df['start'].values[0], chunk_df['end'].values[-1], dtype=np.float32) 575 | else: 576 | gt_df = pgr.read_range(chunk_df['start'].values[0], chunk_df['end'].values[-1], impute_mean=False, dtype=np.int8) 577 | var_df = variant_df.iloc[chunk_df['start'].values[0]:chunk_df['end'].values[-1]+1] 578 | yield gt_df, var_df, phenotype_df[ix], phenotype_pos_df[ix], ci 579 | -------------------------------------------------------------------------------- /tensorqtl/mixqtl.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import os 4 | import sys 5 | sys.path.insert(1, os.path.dirname(__file__)) 6 | import cis 7 | from core import * 8 | 9 | 10 | def trc(genotypes_t, counts_t, covariates_t=None, select_covariates=True, 11 | count_threshold=0, imputation='offset', mode='standard', return_af=False): 12 | """ 13 | Inputs 14 | genotypes_t: dosages (variants x samples) 15 | counts_t: DESeq size factor-normalized read counts 16 | covariates_t: covariates matrix, first column must be intercept 17 | mode: if 'standard', parallel regression for each variant in genotypes_t 18 | if 'multi', multiple regression for all 
variants in genotypes_t 19 | 20 | Outputs: 21 | t-statistic, beta, beta_se {af, ma_samples, ma_counts} (mode='standard') 22 | beta, beta_se (mode='multi') 23 | """ 24 | nonzero_t = counts_t != 0 25 | 26 | if imputation == 'offset': 27 | log_counts_t = counts_t.log1p() 28 | elif imputation == 'half_min': 29 | log_counts_t = counts_t.clone() 30 | log_counts_t[~nonzero_t] = log_counts_t[nonzero_t].min() / 2 31 | log_counts_t = log_counts_t.log() 32 | 33 | if covariates_t is not None: 34 | if select_covariates: 35 | # select significant covariates 36 | b_t, b_se_t = linreg(covariates_t[nonzero_t, :], log_counts_t[nonzero_t], dtype=torch.float32) 37 | tstat_t = b_t / b_se_t 38 | m = tstat_t.abs() > 2 39 | m[0] = True # keep intercept 40 | sel_covariates_t = covariates_t[:, m] 41 | else: 42 | sel_covariates_t = covariates_t 43 | 44 | # Regress out covariates from non-zero counts, and keep zeros. 45 | # This follows the original mixQTL implementation, but may be 46 | # problematic when count_threshold is 0. 47 | residualizer = Residualizer(sel_covariates_t[nonzero_t, 1:]) # exclude intercept 48 | y_t = counts_t.clone() 49 | y_t[nonzero_t] = residualizer.transform(log_counts_t[nonzero_t].reshape(1,-1), center=True) 50 | else: 51 | y_t = log_counts_t 52 | 53 | m_t = counts_t >= count_threshold 54 | 55 | if mode == 'standard': 56 | res = cis.calculate_cis_nominal(genotypes_t[:, m_t] / 2, y_t[m_t], return_af=False) 57 | if return_af: 58 | af, ma_samples, ma_counts = get_allele_stats(genotypes_t) 59 | return *res, af, ma_samples, ma_counts 60 | else: 61 | return res 62 | 63 | elif mode.startswith('multi'): 64 | X_t = torch.cat([torch.ones([m_t.sum(), 1], dtype=bool).to(genotypes_t.device), genotypes_t[:, m_t].T / 2], axis=1) 65 | b_t, b_se_t = linreg(X_t, y_t[m_t], dtype=torch.float32) 66 | return b_t[1:], b_se_t[1:] 67 | -------------------------------------------------------------------------------- /tensorqtl/pgen.py: -------------------------------------------------------------------------------- 1 | # Functions for reading dosages from PLINK pgen files based on the Pgenlib Python API: 2 | # https://github.com/chrchang/plink-ng/blob/master/2.0/Python/python_api.txt 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pgenlib as pg 7 | import os 8 | import bisect 9 | 10 | 11 | def read_pvar(pvar_path): 12 | """Read pvar file as pd.DataFrame""" 13 | return pd.read_csv(pvar_path, sep='\t', comment='#', 14 | names=['chrom', 'pos', 'id', 'ref', 'alt', 'qual', 'filter', 'info'], 15 | dtype={'chrom':str, 'pos':np.int32, 'id':str, 'ref':str, 'alt':str, 16 | 'qual':str, 'filter':str, 'info':str}) 17 | 18 | 19 | def read_psam(psam_path): 20 | """Read psam file as pd.DataFrame""" 21 | psam_df = pd.read_csv(psam_path, sep='\t', index_col=0) 22 | psam_df.index = psam_df.index.astype(str) 23 | return psam_df 24 | 25 | 26 | def hardcall_phase_present(pgen_path): 27 | """Returns True iff phased hardcalls may be present""" 28 | with pg.PgenReader(pgen_path.encode()) as r: 29 | return r.hardcall_phase_present() 30 | 31 | 32 | def get_reader(pgen_path, sample_subset=None): 33 | """""" 34 | if sample_subset is not None: 35 | sample_subset = np.array(sample_subset, dtype=np.uint32) 36 | reader = pg.PgenReader(pgen_path.encode(), sample_subset=sample_subset) 37 | if sample_subset is None: 38 | num_samples = reader.get_raw_sample_ct() 39 | else: 40 | num_samples = len(sample_subset) 41 | return reader, num_samples 42 | 43 | 44 | def read(pgen_path, variant_idx, sample_subset=None, dtype=np.int8): 45 | """ 46 | 
Get genotypes for a variant.
47 | 
48 | Parameters
49 | ----------
50 | pgen_path : str
51 | Path of PLINK 2 pgen file
52 | variant_idx : int
53 | Variant index
54 | sample_subset : array_like
55 | List of sample indexes to select. Must be sorted.
56 | dtype : np.int{8,32,64}
57 | Data type of the returned array.
58 | 
59 | Returns
60 | -------
61 | genotypes : ndarray
62 | Genotypes (as {0, 1, 2, -9}) for the selected variant and samples.
63 | """
64 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset)
65 | genotypes = np.zeros(num_samples, dtype=dtype)
66 | with reader as r:
67 | r.read(np.array(variant_idx, dtype=np.uint32), genotypes)
68 | return genotypes
69 | 
70 | 
71 | def read_dosages(pgen_path, variant_idx, sample_subset=None, dtype=np.float32):
72 | """
73 | Get dosages for a variant.
74 | 
75 | Parameters
76 | ----------
77 | pgen_path : str
78 | Path of PLINK 2 pgen file
79 | variant_idx : int
80 | Variant index
81 | sample_subset : array_like
82 | List of sample indexes to select. Must be sorted.
83 | dtype : np.float{32,64}
84 | Data type of the returned array.
85 | 
86 | Returns
87 | -------
88 | dosages : ndarray
89 | Genotype dosages for the selected variant and samples.
90 | """
91 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset)
92 | dosages = np.zeros(num_samples, dtype=dtype)
93 | with reader as r:
94 | r.read_dosages(np.array(variant_idx, dtype=np.uint32), dosages)
95 | return dosages
96 | 
97 | 
98 | def read_alleles(pgen_path, variant_idx, sample_subset=None):
99 | """
100 | Get alleles for a variant.
101 | 
102 | Parameters
103 | ----------
104 | pgen_path : str
105 | Path of PLINK 2 pgen file
106 | variant_idx : int
107 | Variant index
108 | sample_subset : array_like
109 | List of sample indexes to select. Must be sorted.
110 | 
111 | Returns
112 | -------
113 | alleles : ndarray (2 * sample_ct)
114 | Alleles for the selected variant and samples.
115 | Elements 2n and 2n+1 correspond to sample n.
116 | Both elements are -9 for missing genotypes.
117 | If the genotype is unphased, the lower index appears first.
118 | """
119 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset)
120 | alleles = np.zeros(2*num_samples, dtype=np.int32)
121 | with reader as r:
122 | r.read_alleles(np.array(variant_idx, dtype=np.uint32), alleles)
123 | return alleles
124 | 
125 | 
126 | def read_list(pgen_path, variant_idxs, sample_subset=None, dtype=np.int8):
127 | """
128 | Get genotypes for a list of variants.
129 | 
130 | Parameters
131 | ----------
132 | pgen_path : str
133 | Path of PLINK 2 pgen file
134 | variant_idxs : array_like
135 | List of variant indexes
136 | sample_subset : array_like
137 | List of sample indexes to select. Must be sorted.
138 | dtype : np.int{8,32,64}
139 | Data type of the returned array.
140 | 
141 | Returns
142 | -------
143 | genotypes : ndarray
144 | Genotypes for the selected variants and samples.
145 | """
146 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset)
147 | num_variants = len(variant_idxs)
148 | genotypes = np.zeros([num_variants, num_samples], dtype=dtype)
149 | with reader as r:
150 | r.read_list(np.array(variant_idxs, dtype=np.uint32), genotypes)
151 | return genotypes
152 | 
153 | 
154 | def read_dosages_list(pgen_path, variant_idxs, sample_subset=None, dtype=np.float32):
155 | """
156 | Get dosages for a list of variants. 
157 | 158 | Parameters 159 | ---------- 160 | pgen_path : str 161 | Path of PLINK 2 pgen file 162 | variant_idxs : array_like 163 | List of variant indexes 164 | sample_subset : array_like 165 | List of sample indexes to select. Must be sorted. 166 | dtype : np.float{32,64} 167 | Data type of the returned array. 168 | 169 | Returns 170 | ------- 171 | dosages : ndarray 172 | Genotype dosages for the selected variants and samples. 173 | """ 174 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset) 175 | num_variants = len(variant_idxs) 176 | dosages = np.zeros([num_variants, num_samples], dtype=dtype) 177 | with reader as r: 178 | r.read_dosages_list(np.array(variant_idxs, dtype=np.uint32), dosages) 179 | return dosages 180 | 181 | 182 | def read_alleles_list(pgen_path, variant_idxs, sample_subset=None): 183 | """ 184 | Get alleles for a list of variants. 185 | 186 | Parameters 187 | ---------- 188 | pgen_path : str 189 | Path of PLINK 2 pgen file 190 | variant_idxs : array_like 191 | List of variant indexes 192 | sample_subset : array_like 193 | List of sample indexes to select. Must be sorted. 194 | 195 | Returns 196 | ------- 197 | alleles : ndarray 198 | Alleles for the selected variants and samples. 199 | """ 200 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset) 201 | num_variants = len(variant_idxs) 202 | alleles = np.zeros([num_variants, 2*num_samples], dtype=np.int32) 203 | with reader as r: 204 | r.read_alleles_list(np.array(variant_idxs, dtype=np.uint32), alleles) 205 | return alleles 206 | 207 | 208 | def read_range(pgen_path, start_idx, end_idx, sample_subset=None, dtype=np.int8): 209 | """ 210 | Get genotypes for a range of variants. 211 | 212 | Parameters 213 | ---------- 214 | pgen_path : str 215 | Path of PLINK 2 pgen file 216 | start_idx : int 217 | Start index of the range to query. 218 | end_idx : int 219 | End index of the range to query (inclusive). 220 | sample_subset : array_like 221 | List of sample indexes to select. Must be sorted. 222 | dtype : np.int{8,32,64} 223 | Data type of the returned array. 224 | 225 | Returns 226 | ------- 227 | dosages : ndarray 228 | Genotypes for the selected variants and samples. 229 | """ 230 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset) 231 | num_variants = end_idx - start_idx + 1 232 | genotypes = np.zeros([num_variants, num_samples], dtype=dtype) 233 | with reader as r: 234 | r.read_range(start_idx, end_idx+1, genotypes) 235 | return genotypes 236 | 237 | 238 | def read_dosages_range(pgen_path, start_idx, end_idx, sample_subset=None, dtype=np.float32): 239 | """ 240 | Get dosages for a range of variants. 241 | 242 | Parameters 243 | ---------- 244 | pgen_path : str 245 | Path of PLINK 2 pgen file 246 | start_idx : int 247 | Start index of the range to query. 248 | end_idx : int 249 | End index of the range to query (inclusive). 250 | sample_subset : array_like 251 | List of sample indexes to select. Must be sorted. 252 | dtype : np.float{32,64} 253 | Data type of the returned array. 254 | 255 | Returns 256 | ------- 257 | dosages : ndarray 258 | Genotype dosages for the selected variants and samples. 
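
    Examples
    --------
    A sketch (the pgen path is a placeholder); end_idx is inclusive, so this
    reads the first ten variants:

    >>> dosages = read_dosages_range('data/chr18.pgen', 0, 9)
    >>> dosages.shape   # (10, num_samples)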
259 | """ 260 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset) 261 | num_variants = end_idx - start_idx + 1 262 | dosages = np.zeros([num_variants, num_samples], dtype=dtype) 263 | with reader as r: 264 | r.read_dosages_range(start_idx, end_idx+1, dosages) 265 | return dosages 266 | 267 | 268 | def read_alleles_range(pgen_path, start_idx, end_idx, sample_subset=None): 269 | """ 270 | Get alleles for a range of variants. 271 | 272 | Parameters 273 | ---------- 274 | pgen_path : str 275 | Path of PLINK 2 pgen file 276 | start_idx : int 277 | Start index of the range to query. 278 | end_idx : int 279 | End index of the range to query (inclusive). 280 | sample_subset : array_like 281 | List of sample indexes to select. Must be sorted. 282 | 283 | Returns 284 | ------- 285 | alleles : ndarray 286 | Alleles for the selected variants and samples. 287 | """ 288 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset) 289 | num_variants = end_idx - start_idx + 1 290 | alleles = np.zeros([num_variants, 2*num_samples], dtype=np.int32) 291 | with reader as r: 292 | r.read_alleles_range(start_idx, end_idx+1, alleles) 293 | return alleles 294 | 295 | 296 | def _impute_mean(genotypes): 297 | """Impute missing genotypes to mean""" 298 | m = genotypes == -9 299 | if genotypes.ndim == 1 and any(m): 300 | genotypes[m] = genotypes[~m].mean() 301 | else: # genotypes.ndim == 2 302 | ix = np.nonzero(m)[0] 303 | if len(ix) > 0: 304 | a = genotypes.sum(1) 305 | b = m.sum(1) 306 | mu = (a + 9*b) / (genotypes.shape[1] - b) 307 | genotypes[m] = mu[ix] 308 | 309 | 310 | class PgenReader(object): 311 | """ 312 | Class for reading genotype data from PLINK 2 pgen files 313 | 314 | To generate the pgen/psam/pvar files from a VCF, run 315 | plink2 --vcf ${vcf_file} --output-chr chrM --out ${plink_prefix_path} 316 | To use dosages, run: 317 | plink2 --vcf ${vcf_file} 'dosage=DS' --output-chr chrM --out ${plink_prefix_path} 318 | 319 | Requires pgenlib: https://github.com/chrchang/plink-ng/tree/master/2.0/Python 320 | """ 321 | def __init__(self, plink_prefix_path, select_samples=None): 322 | """ 323 | plink_prefix_path: prefix to PLINK pgen,psam,pvar files 324 | select_samples: specify a subset of samples 325 | """ 326 | 327 | if os.path.exists(f"{plink_prefix_path}.pvar.parquet"): 328 | self.pvar_df = pd.read_parquet(f"{plink_prefix_path}.pvar.parquet") 329 | else: 330 | self.pvar_df = read_pvar(f"{plink_prefix_path}.pvar") 331 | self.psam_df = read_psam(f"{plink_prefix_path}.psam") 332 | self.pgen_file = f"{plink_prefix_path}.pgen" 333 | 334 | self.num_variants = self.pvar_df.shape[0] 335 | self.variant_ids = self.pvar_df['id'].tolist() 336 | self.variant_idx_dict = {i:k for k,i in enumerate(self.variant_ids)} 337 | 338 | self.sample_id_list = self.psam_df.index.tolist() 339 | self.set_samples(select_samples) 340 | 341 | variant_df = self.pvar_df.set_index('id')[['chrom', 'pos']] 342 | variant_df['index'] = np.arange(variant_df.shape[0]) 343 | self.variant_df = variant_df 344 | self.variant_dfs = {c:g[['pos', 'index']] for c,g in variant_df.groupby('chrom', sort=False)} 345 | 346 | def set_samples(self, sample_ids=None, sort=True): 347 | """ 348 | Set samples to load. 349 | 350 | Parameters 351 | ---------- 352 | sample_ids : array_like 353 | List of samples to select. 354 | sort : bool 355 | Preserve sample order from pgen file. 
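
    Notes
    -----
    Sample indexes passed to the reader must be in increasing order, so with
    sort=True the requested IDs are stored in psam-file order. A sketch (the
    prefix and sample IDs are placeholders):

    >>> pgr = PgenReader('data/GEUVADIS.chr18')
    >>> pgr.set_samples(['HG00276', 'HG00096'])  # reordered to psam order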
356 |         """
357 |         if sample_ids is None:
358 |             self.sample_ids = self.sample_id_list
359 |             self.sample_idxs = None
360 |         else:
361 |             sample_idxs = [self.sample_id_list.index(i) for i in sample_ids]
362 |             if sort:
363 |                 sidx = np.argsort(sample_idxs)
364 |                 sample_idxs = [sample_idxs[i] for i in sidx]
365 |                 sample_ids = [sample_ids[i] for i in sidx]
366 |             self.sample_ids = sample_ids
367 |             self.sample_idxs = sample_idxs
368 | 
369 |     def get_range(self, region, start=None, end=None):
370 |         """
371 |         Get variant indexes corresponding to region specified as 'chr:start-end', or as chr, start, end.
372 | 
373 |         Parameters
374 |         ----------
375 |         region : str
376 |             Genomic region, defined as 'chr:start-end' (1-based, inclusive), or chromosome.
377 |         start : int
378 |             Start position of the genomic interval (if chromosome is provided in first argument).
379 |         end : int
380 |             End position of the genomic interval (if chromosome is provided in first argument).
381 | 
382 |         Returns
383 |         -------
384 |         indexes : ndarray
385 |             [start, end] indexes (inclusive)
386 |         """
387 |         if start is None and end is None:
388 |             if ':' in region:
389 |                 chrom, pos = region.split(':')
390 |                 start, end = [int(i) for i in pos.split('-')]
391 |             else:  # full chromosome selected
392 |                 chrom = region
393 |                 return self.variant_dfs[chrom]['index'].values[[0, -1]]
394 |         else:  # input is chr, start, end
395 |             chrom = region
396 | 
397 |         lb = bisect.bisect_left(self.variant_dfs[chrom]['pos'].values, start)
398 |         ub = bisect.bisect_right(self.variant_dfs[chrom]['pos'].values, end)
399 |         if lb != ub:
400 |             r = self.variant_dfs[chrom]['index'].values[[lb, ub - 1]]
401 |         else:
402 |             r = []
403 |         return r
404 | 
405 |     def read(self, variant_id, impute_mean=True, dtype=np.float32):
406 |         """Read genotypes for an individual variant as 0,1,2,-9; impute missing values (-9) to mean (default)."""
407 |         variant_idx = self.variant_idx_dict[variant_id]
408 |         genotypes = read(self.pgen_file, variant_idx, sample_subset=self.sample_idxs,
409 |                          dtype=np.int8).astype(dtype)
410 |         if impute_mean:
411 |             _impute_mean(genotypes)
412 |         return pd.Series(genotypes, index=self.sample_ids, name=variant_id)
413 | 
414 |     def read_list(self, variant_ids, impute_mean=True, dtype=np.float32):
415 |         """Read genotypes for a list of variants as 0,1,2,-9; impute missing values (-9) to mean (default)."""
416 |         variant_idxs = [self.variant_idx_dict[i] for i in variant_ids]
417 |         genotypes = read_list(self.pgen_file, variant_idxs, sample_subset=self.sample_idxs,
418 |                               dtype=np.int8).astype(dtype)
419 |         if impute_mean:
420 |             _impute_mean(genotypes)
421 |         return pd.DataFrame(genotypes, index=variant_ids, columns=self.sample_ids)
422 | 
423 |     def read_range(self, start_idx, end_idx, impute_mean=True, dtype=np.float32):
424 |         """Read genotypes for a range of variants as 0,1,2,-9; impute missing values (-9) to mean (default)."""
425 |         genotypes = read_range(self.pgen_file, start_idx, end_idx, sample_subset=self.sample_idxs,
426 |                                dtype=np.int8).astype(dtype)
427 |         if impute_mean:
428 |             _impute_mean(genotypes)
429 |         return pd.DataFrame(genotypes, index=self.variant_ids[start_idx:end_idx+1], columns=self.sample_ids)
430 | 
431 |     def read_region(self, region, start_pos=None, end_pos=None, impute_mean=True, dtype=np.float32):
432 |         """Read genotypes for variants in a genomic region as 0,1,2,-9; impute missing values (-9) to mean (default)."""
433 |         r = self.get_range(region, start_pos, end_pos)
434 |         if len(r) > 0:
435 |             return self.read_range(*r, impute_mean=impute_mean, dtype=dtype)
436 | 
437
| def read_dosages(self, variant_id, dtype=np.float32): 438 | variant_idx = self.variant_idx_dict[variant_id] 439 | dosages = read_dosages(self.pgen_file, variant_idx, sample_subset=self.sample_idxs, dtype=dtype) 440 | return pd.Series(dosages, index=self.sample_ids, name=variant_id) 441 | 442 | def read_dosages_list(self, variant_ids, dtype=np.float32): 443 | variant_idxs = [self.variant_idx_dict[i] for i in variant_ids] 444 | dosages = read_dosages_list(self.pgen_file, variant_idxs, sample_subset=self.sample_idxs, dtype=dtype) 445 | return pd.DataFrame(dosages, index=variant_ids, columns=self.sample_ids) 446 | 447 | def read_dosages_range(self, start_idx, end_idx, dtype=np.float32): 448 | dosages = read_dosages_range(self.pgen_file, start_idx, end_idx, sample_subset=self.sample_idxs, dtype=dtype) 449 | return pd.DataFrame(dosages, index=self.variant_ids[start_idx:end_idx+1], columns=self.sample_ids) 450 | 451 | def read_dosages_region(self, region, start_pos=None, end_pos=None, dtype=np.float32): 452 | r = self.get_range(region, start_pos, end_pos) 453 | if len(r) > 0: 454 | return self.read_dosages_range(*r, dtype=dtype) 455 | 456 | def read_alleles(self, variant_id): 457 | variant_idx = self.variant_idx_dict[variant_id] 458 | alleles = read_alleles(self.pgen_file, variant_idx, sample_subset=self.sample_idxs) 459 | s1 = pd.Series(alleles[::2], index=self.sample_ids, name=variant_id) 460 | s2 = pd.Series(alleles[1::2], index=self.sample_ids, name=variant_id) 461 | return s1, s2 462 | 463 | def read_alleles_list(self, variant_ids): 464 | variant_idxs = [self.variant_idx_dict[i] for i in variant_ids] 465 | alleles = read_alleles_list(self.pgen_file, variant_idxs, sample_subset=self.sample_idxs) 466 | df1 = pd.DataFrame(alleles[:,::2], index=variant_ids, columns=self.sample_ids) 467 | df2 = pd.DataFrame(alleles[:,1::2], index=variant_ids, columns=self.sample_ids) 468 | return df1, df2 469 | 470 | def read_alleles_range(self, start_idx, end_idx): 471 | alleles = read_alleles_range(self.pgen_file, start_idx, end_idx, sample_subset=self.sample_idxs) 472 | df1 = pd.DataFrame(alleles[:,::2], index=self.variant_ids[start_idx:end_idx+1], columns=self.sample_ids) 473 | df2 = pd.DataFrame(alleles[:,1::2], index=self.variant_ids[start_idx:end_idx+1], columns=self.sample_ids) 474 | return df1, df2 475 | 476 | def read_alleles_region(self, region, start_pos=None, end_pos=None): 477 | r = self.get_range(region, start_pos, end_pos) 478 | if len(r) > 0: 479 | return self.read_alleles_range(*r) 480 | else: 481 | return None, None 482 | 483 | def load_genotypes(self): 484 | """Load all genotypes as np.int8, without imputing missing values.""" 485 | genotypes = read_range(self.pgen_file, 0, self.num_variants-1, sample_subset=self.sample_idxs) 486 | return pd.DataFrame(genotypes, index=self.variant_ids, columns=self.sample_ids) 487 | 488 | def load_dosages(self): 489 | """Load all dosages.""" 490 | return self.read_dosages_range(0, self.num_variants-1) 491 | 492 | def load_alleles(self): 493 | """Load all alleles.""" 494 | return self.read_alleles_range(0, self.num_variants-1) 495 | 496 | def get_pairwise_ld(self, id1, id2, r2=True, dtype=np.float32): 497 | """Compute pairwise LD (R2) between (lists of) variants""" 498 | if isinstance(id1, str) and isinstance(id2, str): 499 | g1 = self.read(id1, dtype=dtype) 500 | g2 = self.read(id2, dtype=dtype) 501 | g1 -= g1.mean() 502 | g2 -= g2.mean() 503 | if r2: 504 | r = (g1 * g2).sum()**2 / ( (g1**2).sum() * (g2**2).sum() ) 505 | else: 506 | r = (g1 * g2).sum() / 
np.sqrt( (g1**2).sum() * (g2**2).sum() ) 507 | elif isinstance(id1, str): 508 | g1 = self.read(id1, dtype=dtype) 509 | g2 = self.read_list(id2, dtype=dtype) 510 | g1 -= g1.mean() 511 | g2 -= g2.values.mean(1, keepdims=True) 512 | if r2: 513 | r = (g1 * g2).sum(1)**2 / ( (g1**2).sum() * (g2**2).sum(1) ) 514 | else: 515 | r = (g1 * g2).sum(1) / np.sqrt( (g1**2).sum() * (g2**2).sum(1) ) 516 | elif isinstance(id2, str): 517 | g1 = self.read_list(id1, dtype=dtype) 518 | g2 = self.read(id2, dtype=dtype) 519 | g1 -= g1.values.mean(1, keepdims=True) 520 | g2 -= g2.mean() 521 | if r2: 522 | r = (g1 * g2).sum(1)**2 / ( (g1**2).sum(1) * (g2**2).sum() ) 523 | else: 524 | r = (g1 * g2).sum(1) / np.sqrt( (g1**2).sum(1) * (g2**2).sum() ) 525 | else: 526 | assert len(id1) == len(id2) 527 | g1 = self.read_list(id1, dtype=dtype).values 528 | g2 = self.read_list(id2, dtype=dtype).values 529 | g1 -= g1.mean(1, keepdims=True) 530 | g2 -= g2.mean(1, keepdims=True) 531 | if r2: 532 | r = (g1 * g2).sum(1) ** 2 / ( (g1**2).sum(1) * (g2**2).sum(1) ) 533 | else: 534 | r = (g1 * g2).sum(1) / np.sqrt( (g1**2).sum(1) * (g2**2).sum(1) ) 535 | return r 536 | 537 | def get_ld_matrix(self, variant_ids, dtype=np.float32): 538 | g = self.read_list(variant_ids, dtype=dtype).values 539 | return pd.DataFrame(np.corrcoef(g), index=variant_ids, columns=variant_ids) 540 | 541 | 542 | def load_dosages_df(plink_prefix_path, select_samples=None): 543 | """ 544 | Load dosages for all variants and all/selected samples as a dataframe. 545 | 546 | Parameters 547 | ---------- 548 | plink_prefix_path : str 549 | Prefix to .pgen/.psam/.pvar files 550 | select_samples : array_like 551 | List of sample IDs to select. Default: all samples. 552 | 553 | Returns 554 | ------- 555 | dosages_df : pd.DataFrame (variants x samples) 556 | Genotype dosages for the selected samples. 
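
    Examples
    --------
    A sketch (the PLINK prefix and sample IDs are placeholders):

    >>> dosages_df = load_dosages_df('data/GEUVADIS.chr18',
    ...                              select_samples=['HG00096', 'HG00097'])
    >>> dosages_df.shape   # (num_variants, 2)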
557 |     """
558 |     p = PgenReader(plink_prefix_path, select_samples=select_samples)
559 |     return p.load_dosages()
560 | 
--------------------------------------------------------------------------------
/tensorqtl/post.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import torch
4 | import scipy.stats as stats
5 | import subprocess
6 | import sys
7 | import os
8 | import glob
9 | from datetime import datetime
10 | 
11 | sys.path.insert(1, os.path.dirname(__file__))
12 | from core import *
13 | import mixqtl
14 | import qtl.genotype as gt
15 | 
16 | 
17 | def calculate_qvalues(res_df, fdr=0.05, qvalue_lambda=None, logger=None):
18 |     """Annotate permutation results with q-values, p-value threshold"""
19 |     if logger is None:
20 |         logger = SimpleLogger()
21 | 
22 |     logger.write('Computing q-values')
23 |     logger.write(f' * Number of phenotypes tested: {res_df.shape[0]}')
24 | 
25 |     if not res_df['pval_beta'].isnull().all():
26 |         pval_col = 'pval_beta'
27 |         r = stats.pearsonr(res_df['pval_perm'], res_df['pval_beta'])[0]
28 |         logger.write(f' * Correlation between Beta-approximated and empirical p-values: {r:.4f}')
29 |     else:
30 |         pval_col = 'pval_perm'
31 |         logger.write(' * WARNING: no beta-approximated p-values found, using permutation p-values instead.')
32 | 
33 |     # calculate q-values
34 |     if qvalue_lambda is not None:
35 |         logger.write(f' * Calculating q-values with lambda = {qvalue_lambda:.3f}')
36 |     qval, pi0 = rfunc.qvalue(res_df[pval_col], lambda_qvalue=qvalue_lambda)
37 | 
38 |     res_df['qval'] = qval
39 |     logger.write(f' * Proportion of significant phenotypes (1-pi0): {1-pi0:.2f}')
40 |     logger.write(f" * QTL phenotypes @ FDR {fdr:.2f}: {(res_df['qval'] <= fdr).sum()}")
41 | 
42 |     # determine global min(p) significance threshold and calculate nominal p-value threshold for each gene
43 |     if pval_col == 'pval_beta':
44 |         lb = res_df.loc[res_df['qval'] <= fdr, 'pval_beta'].sort_values()
45 |         ub = res_df.loc[res_df['qval'] > fdr, 'pval_beta'].sort_values()
46 | 
47 |         if len(lb) > 0:  # significant phenotypes
48 |             lb = lb.iloc[-1]
49 |             if len(ub) > 0:
50 |                 ub = ub.iloc[0]
51 |                 pthreshold = (lb+ub)/2
52 |             else:
53 |                 pthreshold = lb
54 |             logger.write(f' * min p-value threshold @ FDR {fdr}: {pthreshold:.6g}')
55 |             res_df['pval_nominal_threshold'] = stats.beta.ppf(pthreshold, res_df['beta_shape1'], res_df['beta_shape2'])
56 | 
57 | 
58 | def calculate_afc(assoc_df, counts_df, genotype_df, variant_df=None, covariates_df=None,
59 |                   select_covariates=True, group='gene_id',
60 |                   imputation='offset', count_threshold=0, verbose=True):
61 |     """
62 |     Calculate allelic fold-change (aFC) for variant-gene pairs
63 | 
64 |     Inputs
65 |       assoc_df: dataframe containing variant-gene associations, must have 'gene_id'
66 |         and 'variant_id' columns. If multiple variants/gene are detected, effects
67 |         are estimated jointly.
68 |       counts_df: read counts scaled with DESeq size factors. Zeros are imputed using
69 |         log(counts + 1) (imputation='offset'; default) or with half-minimum
70 |         (imputation='half_min').
71 |       genotype_df: genotype dosages
72 |       covariates_df: covariates (genotype PCs, PEER factors, etc.)
73 | 
74 |     aFC [1] is computed using the total read count (trc) model from mixQTL [2].
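
    A minimal usage sketch (all identifiers are illustrative):

        assoc_df = pd.DataFrame({'gene_id':    ['geneA', 'geneA'],
                                 'variant_id': ['chr18_100_A_G_b38', 'chr18_500_C_T_b38']})
        afc_df = calculate_afc(assoc_df, counts_df, genotype_df,
                               variant_df=variant_df, covariates_df=covariates_df)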
75 | 
76 |     [1] Mohammadi et al., 2017 (genome.cshlp.org/content/27/11/1872)
77 |     [2] Liang et al., 2021 (10.1038/s41467-021-21592-8)
78 |     """
79 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
80 | 
81 |     if variant_df is not None:
82 |         gi = gt.GenotypeIndexer(genotype_df, variant_df)
83 |     else:
84 |         assert isinstance(genotype_df, gt.GenotypeIndexer)
85 |         gi = genotype_df
86 |     genotype_ix = np.array([gi.genotype_df.columns.tolist().index(i) for i in counts_df.columns])
87 |     genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
88 | 
89 |     if covariates_df is not None:
90 |         covariates_t = torch.tensor(covariates_df.values, dtype=torch.float32).to(device)
91 |     else:
92 |         covariates_t = None
93 | 
94 |     afc_df = []
95 |     n = len(assoc_df[group].unique())
96 |     for k, (phenotype_id, gdf) in enumerate(assoc_df.groupby(group, sort=False), 1):
97 |         if verbose and (k % 10 == 0 or k == n):
98 |             print(f"\rCalculating aFC for {group.replace('_id','')} {k}/{n}", end='' if k != n else None, flush=True)
99 | 
100 |         counts_t = torch.tensor(counts_df.loc[phenotype_id].values,
101 |                                 dtype=torch.float32).to(device)
102 |         genotypes_t = torch.tensor(gi.get_genotypes(gdf['variant_id'].tolist()), dtype=torch.float32).to(device)
103 |         genotypes_t = genotypes_t[:,genotype_ix_t]
104 |         impute_mean(genotypes_t)
105 |         try:
106 |             b, b_se = mixqtl.trc(genotypes_t, counts_t, covariates_t=covariates_t,
107 |                                  select_covariates=select_covariates, count_threshold=count_threshold,
108 |                                  imputation=imputation, mode='multi', return_af=False)
109 |             gdf['afc'] = b.cpu().numpy() * np.log2(np.e)
110 |             gdf['afc_se'] = b_se.cpu().numpy() * np.log2(np.e)
111 |             afc_df.append(gdf)
112 |         except Exception:
113 |             print(f'WARNING: aFC calculation failed for {phenotype_id}')
114 |     afc_df = pd.concat(afc_df)
115 | 
116 |     return afc_df
117 | 
118 | 
119 | def calculate_replication(res_df, genotypes, phenotype_df, covariates_df=None, paired_covariate_df=None,
120 |                           interaction_s=None, compute_pi1=False, lambda_qvalue=None, logp=False):
121 |     """res_df: DataFrame with 'variant_id' column and phenotype IDs as index"""
122 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
123 | 
124 |     if paired_covariate_df is not None:
125 |         assert paired_covariate_df.index.equals(covariates_df.index)
126 |         assert paired_covariate_df.columns.isin(phenotype_df.index).all()
127 | 
128 |     if isinstance(genotypes, pd.DataFrame):
129 |         genotypes_t = torch.tensor(genotypes.loc[res_df['variant_id']].values, dtype=torch.float).to(device)
130 |         genotype_ix = np.array([genotypes.columns.tolist().index(i) for i in phenotype_df.columns])
131 |     else:  # pgen.PgenReader
132 |         gt_df = genotypes.read_list(res_df['variant_id'], impute_mean=False)
133 |         genotypes_t = torch.tensor(gt_df.values, dtype=torch.float).to(device)
134 |         genotype_ix = np.array([gt_df.columns.tolist().index(i) for i in phenotype_df.columns])
135 | 
136 |     genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
137 |     genotypes_t = genotypes_t[:,genotype_ix_t]
138 |     impute_mean(genotypes_t)
139 |     af_t, ma_samples_t, ma_count_t = get_allele_stats(genotypes_t)
140 | 
141 |     phenotypes_t = torch.tensor(phenotype_df.loc[res_df.index].values, dtype=torch.float32).to(device)
142 | 
143 |     if covariates_df is not None:
144 |         residualizer = Residualizer(torch.tensor(covariates_df.values, dtype=torch.float32).to(device))
145 |         # dof -= covariates_df.shape[1]
146 |     else:
147 |         residualizer = None
148 | 
149 |     if interaction_s is None:
150 |         if paired_covariate_df is None:
151 |             if residualizer is not
None: 152 | genotype_res_t = residualizer.transform(genotypes_t) # variants x samples 153 | phenotype_res_t = residualizer.transform(phenotypes_t) # phenotypes x samples 154 | dof = residualizer.dof 155 | dof_t = dof 156 | else: 157 | genotype_res_t = genotypes_t 158 | phenotype_res_t = phenotypes_t 159 | dof = phenotypes_t.shape[1] - 2 160 | dof_t = dof 161 | else: 162 | genotype_res_t = torch.zeros_like(genotypes_t).to(device) 163 | phenotype_res_t = torch.zeros_like(phenotypes_t).to(device) 164 | dof = [] 165 | for k,phenotype_id in enumerate(res_df.index): 166 | if phenotype_id in paired_covariate_df: 167 | iresidualizer = Residualizer(torch.tensor(np.c_[covariates_df, paired_covariate_df[phenotype_id]], 168 | dtype=torch.float32).to(device)) 169 | else: 170 | iresidualizer = residualizer 171 | genotype_res_t[[k]] = iresidualizer.transform(genotypes_t[[k]]) 172 | phenotype_res_t[[k]] = iresidualizer.transform(phenotypes_t[[k]]) 173 | dof.append(iresidualizer.dof) 174 | dof = np.array(dof) 175 | dof_t = torch.Tensor(dof).to(device) 176 | 177 | gstd = genotype_res_t.var(1) 178 | pstd = phenotype_res_t.var(1) 179 | std_ratio_t = torch.sqrt(pstd / gstd) 180 | 181 | # center and normalize 182 | genotype_res_t = center_normalize(genotype_res_t, dim=1) 183 | phenotype_res_t = center_normalize(phenotype_res_t, dim=1) 184 | 185 | r_nominal_t = (genotype_res_t * phenotype_res_t).sum(1) 186 | r2_nominal_t = r_nominal_t.double().pow(2) 187 | 188 | tstat_t = torch.sqrt((dof_t * r2_nominal_t) / (1 - r2_nominal_t)) 189 | slope_t = r_nominal_t * std_ratio_t 190 | slope_se_t = (slope_t.abs().double() / tstat_t).float() 191 | pval = 2*stats.t.cdf(-np.abs(tstat_t.cpu()), dof) 192 | 193 | rep_df = pd.DataFrame(np.c_[res_df.index, res_df['variant_id'], ma_samples_t.cpu(), ma_count_t.cpu(), af_t.cpu(), pval, slope_t.cpu(), slope_se_t.cpu()], 194 | columns=['phenotype_id', 'variant_id', 'ma_samples', 'ma_count', 'af', 'pval_nominal', 'slope', 'slope_se']).infer_objects() 195 | 196 | else: 197 | if paired_covariate_df is not None: 198 | raise NotImplementedError("Paired covariates are not yet supported for interactions") 199 | 200 | interaction_t = torch.tensor(interaction_s.values.reshape(1,-1), dtype=torch.float32).to(device) 201 | ng, ns = genotypes_t.shape 202 | nps = phenotypes_t.shape[0] 203 | 204 | # centered inputs 205 | g0_t = genotypes_t - genotypes_t.mean(1, keepdim=True) 206 | gi_t = genotypes_t * interaction_t 207 | gi0_t = gi_t - gi_t.mean(1, keepdim=True) 208 | i0_t = interaction_t - interaction_t.mean() 209 | p0_t = phenotypes_t - phenotypes_t.mean(1, keepdim=True) 210 | 211 | # residualize rows 212 | g0_t = residualizer.transform(g0_t, center=False) 213 | gi0_t = residualizer.transform(gi0_t, center=False) 214 | p0_t = residualizer.transform(p0_t, center=False) # np x ns 215 | i0_t = residualizer.transform(i0_t, center=False) 216 | i0_t = i0_t.repeat(ng, 1) 217 | 218 | # regression (in float; loss of precision may occur in edge cases) 219 | X_t = torch.stack([g0_t, i0_t, gi0_t], 2) # ng x ns x 3 220 | Xinv = torch.matmul(torch.transpose(X_t, 1, 2), X_t).inverse() # ng x 3 x 3 221 | b_t = (torch.matmul(Xinv, torch.transpose(X_t, 1, 2)) * p0_t.unsqueeze(1)).sum(2) # ng x 3 222 | r_t = (X_t * b_t.unsqueeze(1)).sum(2) - p0_t 223 | dof = residualizer.dof - 2 224 | rss_t = (r_t*r_t).sum(1) # ng x np 225 | b_se_t = torch.sqrt( Xinv[:, torch.eye(3, dtype=torch.uint8).bool()] * rss_t.unsqueeze(-1) / dof ) 226 | tstat_t = (b_t.double() / b_se_t.double()).float() 227 | pval = 
2*stats.t.cdf(-np.abs(tstat_t.cpu()), dof) 228 | b = b_t.cpu() 229 | b_se = b_se_t.cpu() 230 | 231 | rep_df = pd.DataFrame(np.c_[res_df.index, res_df['variant_id'], ma_samples_t.cpu(), ma_count_t.cpu(), af_t.cpu(), 232 | pval[:,0], b[:,0], b_se[:,0], pval[:,1], b[:,1], b_se[:,1], pval[:,2], b[:,2], b_se[:,2]], 233 | columns=['phenotype_id', 'variant_id', 'ma_samples', 'ma_count', 'af', 234 | 'pval_g', 'b_g', 'b_g_se', 'pval_i', 'b_i', 'b_i_se', 'pval_gi', 'b_gi', 'b_gi_se']).infer_objects() 235 | pval = pval[:,2] 236 | 237 | if compute_pi1: 238 | try: 239 | pi1 = 1 - rfunc.pi0est(pval, lambda_qvalue=lambda_qvalue)[0] 240 | except: 241 | pi1 = np.nan 242 | return pi1, rep_df 243 | else: 244 | return rep_df 245 | 246 | 247 | def annotate_genes(gene_df, annotation_gtf, lookup_df=None): 248 | """ 249 | Add gene and variant annotations (e.g., gene_name, rs_id, etc.) to gene-level output 250 | 251 | gene_df: output from map_cis() 252 | annotation_gtf: gene annotation in GTF format 253 | lookup_df: DataFrame with variant annotations, indexed by 'variant_id' 254 | """ 255 | gene_dict = {} 256 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Adding gene and variant annotations', flush=True) 257 | print(' * parsing GTF', flush=True) 258 | with open(annotation_gtf) as gtf: 259 | for row in gtf: 260 | row = row.strip().split('\t') 261 | if row[0][0] == '#' or row[2] != 'gene': continue 262 | # get gene_id and gene_name from attributes 263 | attr = dict([i.split() for i in row[8].replace('"','').split(';') if i!='']) 264 | # gene_name, gene_chr, gene_start, gene_end, strand 265 | gene_dict[attr['gene_id']] = [attr['gene_name'], row[0], row[3], row[4], row[6]] 266 | 267 | print(' * annotating genes', flush=True) 268 | if 'group_id' in gene_df: 269 | gene_info = pd.DataFrame(data=[gene_dict[i] for i in gene_df['group_id']], 270 | columns=['gene_name', 'gene_chr', 'gene_start', 'gene_end', 'strand'], 271 | index=gene_df.index) 272 | else: 273 | gene_info = pd.DataFrame(data=[gene_dict[i] for i in gene_df.index], 274 | columns=['gene_name', 'gene_chr', 'gene_start', 'gene_end', 'strand'], 275 | index=gene_df.index) 276 | gene_df = pd.concat([gene_info, gene_df], axis=1) 277 | assert np.all(gene_df.index == gene_info.index) 278 | 279 | col_order = ['gene_name', 'gene_chr', 'gene_start', 'gene_end', 'strand', 280 | 'num_var', 'beta_shape1', 'beta_shape2', 'true_df', 'pval_true_df', 'variant_id'] 281 | if 'tss_distance' in gene_df: 282 | col_order += ['tss_distance'] 283 | else: 284 | col_order += ['start_distance', 'end_distance'] 285 | if lookup_df is not None: 286 | print(' * adding variant annotations from lookup table', flush=True) 287 | gene_df = gene_df.join(lookup_df, on='variant_id') # add variant information 288 | col_order += list(lookup_df.columns) 289 | col_order += ['ma_samples', 'ma_count', 'af', 'pval_nominal', 290 | 'slope', 'slope_se', 'pval_perm', 'pval_beta'] 291 | if 'group_id' in gene_df: 292 | col_order += ['group_id', 'group_size'] 293 | col_order += ['qval', 'pval_nominal_threshold'] 294 | gene_df = gene_df[col_order] 295 | print('done.', flush=True) 296 | return gene_df 297 | 298 | 299 | def get_significant_pairs(res_df, nominal_files, group_s=None, fdr=0.05): 300 | """Significant variant-phenotype pairs based on nominal p-value threshold for each phenotype""" 301 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] tensorQTL: parsing all significant variant-phenotype pairs', flush=True) 302 | assert 'qval' in res_df 303 | 304 | # significant phenotypes (apply FDR 
threshold)
305 |     if group_s is not None:
306 |         df = res_df.loc[res_df['qval'] <= fdr, ['pval_nominal_threshold', 'pval_nominal', 'pval_beta', 'group_id']].copy()
307 |         df.set_index('group_id', inplace=True)
308 |     else:
309 |         df = res_df.loc[res_df['qval'] <= fdr, ['pval_nominal_threshold', 'pval_nominal', 'pval_beta']].copy()
310 |     df.rename(columns={'pval_nominal': 'min_pval_nominal'}, inplace=True)
311 |     signif_phenotype_ids = set(df.index)
312 |     threshold_dict = df['pval_nominal_threshold'].to_dict()
313 | 
314 |     if isinstance(nominal_files, str):
315 |         # chr -> file
316 |         nominal_files = {os.path.basename(i).split('.')[-2]:i for i in glob.glob(nominal_files+'*.parquet')}
317 |     else:
318 |         assert isinstance(nominal_files, dict)
319 | 
320 |     chroms = sorted(nominal_files.keys(), key=lambda x: int(x.replace('chr', '').replace('X', '23')))
321 |     signif_df = []
322 |     for k,c in enumerate(chroms, 1):
323 |         print(f' * processing chr. {k}/{len(chroms)}', end='\r', flush=True)
324 |         nominal_df = pd.read_parquet(nominal_files[c])
325 |         # drop pairs that never pass threshold
326 |         nominal_df = nominal_df[nominal_df['pval_nominal'] <= df['pval_nominal_threshold'].max()]
327 |         if group_s is not None:
328 |             nominal_df.insert(1, 'group_id', nominal_df['phenotype_id'].map(group_s))
329 |             nominal_df = nominal_df[nominal_df['group_id'].isin(signif_phenotype_ids)]
330 |             m = nominal_df['pval_nominal'] < nominal_df['group_id'].apply(lambda x: threshold_dict[x])
331 |         else:
332 |             nominal_df = nominal_df[nominal_df['phenotype_id'].isin(signif_phenotype_ids)]
333 |             m = nominal_df['pval_nominal'] < nominal_df['phenotype_id'].apply(lambda x: threshold_dict[x])
334 |         signif_df.append(nominal_df[m])
335 |     print()
336 |     signif_df = pd.concat(signif_df, axis=0)
337 |     if group_s is not None:
338 |         signif_df = signif_df.merge(df, left_on='group_id', right_index=True)
339 |     else:
340 |         signif_df = signif_df.merge(df, left_on='phenotype_id', right_index=True)
341 |     print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] done', flush=True)
342 |     return signif_df.reset_index(drop=True)
343 | 
--------------------------------------------------------------------------------
/tensorqtl/rfunc.py:
--------------------------------------------------------------------------------
1 | # Author: Francois Aguet
2 | import numpy as np
3 | import rpy2
4 | from rpy2.robjects.packages import importr
5 | from collections.abc import Iterable
6 | from contextlib import contextmanager
7 | import os, sys  # required by suppress_stdout()
8 | # silence R warnings
9 | from rpy2.rinterface_lib.callbacks import logger as rpy2_logger
10 | import logging
11 | rpy2_logger.setLevel(logging.ERROR)
12 | 
13 | @contextmanager
14 | def suppress_stdout():
15 |     with open(os.devnull, "w") as devnull:
16 |         old_stdout = sys.stdout
17 |         sys.stdout = devnull
18 |         try:
19 |             yield
20 |         finally:
21 |             sys.stdout = old_stdout
22 | 
23 | 
24 | def p_adjust(p, method='BH'):
25 |     """Wrapper for p.adjust"""
26 |     rp = rpy2.robjects.vectors.FloatVector(p)
27 |     p_adjust = rpy2.robjects.r['p.adjust']
28 |     return np.array(p_adjust(rp, method=method))
29 | 
30 | 
31 | def t_cdf(t, df, lower_tail=False, log=True):
32 |     """Wrapper for pt"""
33 |     scalar = True
34 |     if isinstance(t, Iterable):
35 |         rt = rpy2.robjects.vectors.FloatVector(t)
36 |         scalar = False
37 |     else:
38 |         rt = t
39 |     if isinstance(df, Iterable):
40 |         rdf = rpy2.robjects.vectors.FloatVector(df)
41 |         scalar = False
42 |     else:
43 |         rdf = df
44 |     r_pt = rpy2.robjects.r['pt']
45 |     res = np.array(r_pt(rt, rdf, lower_tail=lower_tail, log=log))
46 |     if scalar:
47 |         res =
res[0] 48 | return res 49 | 50 | 51 | def qvalue(p, lambda_qvalue=None): 52 | """Wrapper for qvalue::qvalue""" 53 | qvalue = importr("qvalue") 54 | rp = rpy2.robjects.vectors.FloatVector(p) 55 | if lambda_qvalue is None: 56 | q = qvalue.qvalue(rp) 57 | else: 58 | if not isinstance(lambda_qvalue, Iterable): 59 | lambda_qvalue = [lambda_qvalue] 60 | rlambda = rpy2.robjects.vectors.FloatVector(lambda_qvalue) 61 | q = qvalue.qvalue(rp, **{'lambda':rlambda}) 62 | qval = np.array(q.rx2('qvalues')) 63 | pi0 = np.array(q.rx2('pi0'))[0] 64 | return qval, pi0 65 | 66 | 67 | def pi0est(p, lambda_qvalue=None): 68 | """Wrapper for qvalue::pi0est""" 69 | qvalue = importr("qvalue") 70 | rp = rpy2.robjects.vectors.FloatVector(p) 71 | # with suppress_stdout(): 72 | if lambda_qvalue is None: 73 | pi0res = qvalue.pi0est(rp) 74 | else: 75 | if not isinstance(lambda_qvalue, Iterable): 76 | lambda_qvalue = [lambda_qvalue] 77 | rlambda = rpy2.robjects.vectors.FloatVector(lambda_qvalue) 78 | pi0res = qvalue.pi0est(rp, rlambda) 79 | pi0 = np.array(pi0res.rx2('pi0'))[0] 80 | pi0_lambda = np.array(pi0res.rx2('pi0.lambda')) 81 | lambda_vec = np.array(pi0res.rx2('lambda')) 82 | pi0_smooth = np.array(pi0res.rx2('pi0.smooth')) 83 | return pi0, pi0_lambda, lambda_vec, pi0_smooth 84 | -------------------------------------------------------------------------------- /tensorqtl/tensorqtl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from __future__ import print_function 3 | import pandas as pd 4 | import numpy as np 5 | from datetime import datetime 6 | import sys 7 | import os 8 | import re 9 | import pickle 10 | import argparse 11 | from collections import defaultdict 12 | import importlib.metadata 13 | 14 | sys.path.insert(1, os.path.dirname(__file__)) 15 | from core import * 16 | from post import * 17 | import genotypeio, cis, trans, susie 18 | 19 | 20 | def main(): 21 | parser = argparse.ArgumentParser(description='tensorQTL: GPU-based QTL mapper') 22 | parser.add_argument('genotype_path', help='Genotypes in PLINK format') 23 | parser.add_argument('phenotypes', help="Phenotypes in BED format (.bed, .bed.gz, .bed.parquet), or optionally for 'trans' mode, parquet or tab-delimited.") 24 | parser.add_argument('prefix', help='Prefix for output file names') 25 | parser.add_argument('--mode', type=str, default='cis', choices=['cis', 'cis_nominal', 'cis_independent', 'cis_susie', 'trans', 'trans_susie'], 26 | help='Mapping mode. Default: cis') 27 | parser.add_argument('--covariates', default=None, help='Covariates file, tab-delimited (covariates x samples)') 28 | parser.add_argument('--paired_covariate', default=None, help='Single phenotype-specific covariate, tab-delimited (phenotypes x samples)') 29 | parser.add_argument('--permutations', type=int, default=10000, help='Number of permutations. Default: 10000') 30 | parser.add_argument('--interaction', default=None, type=str, help='Tab-delimited file mapping sample ID to interaction value(s) (if multiple interaction terms are used, the file must include a header with variable names)') 31 | parser.add_argument('--cis_output', default=None, type=str, help="Output from 'cis' mode with q-values. Required for independent cis-QTL mapping.") 32 | parser.add_argument('--phenotype_groups', default=None, type=str, help='Phenotype groups. Header-less TSV with two columns: phenotype_id, group_id') 33 | parser.add_argument('--window', default=1000000, type=np.int32, help='Cis-window size, in bases. 
Default: 1000000.') 34 | parser.add_argument('--pval_threshold', default=1e-5, type=np.float64, help='Output only significant phenotype-variant pairs with a p-value below threshold. Default: 1e-5 for trans-QTL') 35 | parser.add_argument('--logp', action='store_true', help='Compute nominal p-values as -log10(P) for added precision (requires R)') 36 | parser.add_argument('--maf_threshold', default=0, type=np.float64, help='Include only genotypes with minor allele frequency >= maf_threshold. Default: 0') 37 | parser.add_argument('--maf_threshold_interaction', default=0.05, type=np.float64, help='MAF threshold for interactions, applied to lower and upper half of samples') 38 | parser.add_argument('--dosages', action='store_true', help='Load dosages instead of genotypes (only applies to PLINK2 bgen input).') 39 | parser.add_argument('--return_dense', action='store_true', help='Return dense output for trans-QTL.') 40 | parser.add_argument('--return_r2', action='store_true', help='Return r2 (only for sparse trans-QTL output)') 41 | parser.add_argument('--best_only', action='store_true', help='Only write lead association for each phenotype (interaction mode only)') 42 | parser.add_argument('--output_text', action='store_true', help='Write output in txt.gz format instead of parquet (trans-QTL mode only)') 43 | parser.add_argument('--batch_size', type=int, default=20000, help='GPU batch size (trans-QTLs only). Reduce this if encountering OOM errors.') 44 | parser.add_argument('--chunk_size', default=None, help="For cis-QTL mapping, load genotypes into CPU memory in chunks of chunk_size variants, or by chromosome if chunk_size is 'chr'.") 45 | parser.add_argument('--susie_loci', default=None, help="Table (parquet or tsv) with loci to fine-map (phenotype_id, chr, pos) with mode 'trans_susie'.") 46 | parser.add_argument('--disable_beta_approx', action='store_true', help='Disable Beta-distribution approximation of empirical p-values (not recommended).') 47 | parser.add_argument('--warn_monomorphic', action='store_true', help='Warn if monomorphic variants are found.') 48 | parser.add_argument('--max_effects', type=int, default=10, help='Maximum number of non-zero effects in the SuSiE regression model.') 49 | parser.add_argument('--fdr', default=0.05, type=np.float64, help='FDR for cis-QTLs') 50 | parser.add_argument('--qvalue_lambda', default=None, type=np.float64, help='lambda parameter for pi0est in qvalue.') 51 | parser.add_argument('--seed', default=None, type=int, help='Seed for permutations.') 52 | parser.add_argument('-o', '--output_dir', default='.', help='Output directory') 53 | args = parser.parse_args() 54 | 55 | # check inputs 56 | if args.mode == 'cis_independent' and (args.cis_output is None or not os.path.exists(args.cis_output)): 57 | raise ValueError("Output from 'cis' mode must be provided.") 58 | if args.interaction is not None and args.mode not in ['cis_nominal', 'trans']: 59 | raise ValueError("Interactions are only supported in 'cis_nominal' or 'trans' mode.") 60 | 61 | logger = SimpleLogger(os.path.join(args.output_dir, f'{args.prefix}.tensorQTL.{args.mode}.log')) 62 | logger.write(f'[{datetime.now().strftime("%b %d %H:%M:%S")}] Running TensorQTL v{importlib.metadata.version("tensorqtl")}: {args.mode.split("_")[0]}-QTL mapping') 63 | if torch.cuda.is_available(): 64 | logger.write(f' * using GPU ({torch.cuda.get_device_name(torch.cuda.current_device())})') 65 | else: 66 | logger.write(' * WARNING: using CPU!') 67 | device = torch.device("cuda" if torch.cuda.is_available() else 
"cpu") 68 | if args.seed is not None: 69 | logger.write(f' * using seed {args.seed}') 70 | 71 | # load inputs 72 | logger.write(f' * reading phenotypes ({args.phenotypes})') 73 | # for cis modes, require BED input with position information 74 | if args.mode.startswith('cis'): 75 | assert args.phenotypes.lower().endswith(('.bed', '.bed.gz', '.bed.parquet')), "For cis modes, phenotypes must be in BED format." 76 | phenotype_df, phenotype_pos_df = read_phenotype_bed(args.phenotypes) 77 | if phenotype_pos_df.columns[1] == 'pos': 78 | logger.write(f" * cis-window detected as position ± {args.window:,}") 79 | else: 80 | logger.write(f" * cis-window detected as [start - {args.window:,}, end + {args.window:,}]") 81 | elif args.mode.startswith('trans'): 82 | if args.phenotypes.lower().endswith(('.bed', '.bed.gz', '.bed.parquet')): 83 | phenotype_df, phenotype_pos_df = read_phenotype_bed(args.phenotypes) 84 | else: 85 | if args.phenotypes.endswith('.parquet'): 86 | phenotype_df = pd.read_parquet(args.phenotypes) 87 | else: # assume tab-delimited 88 | phenotype_df = pd.read_csv(args.phenotypes, sep='\t', index_col=0) 89 | phenotype_pos_df = None 90 | 91 | if args.covariates is not None: 92 | logger.write(f' * reading covariates ({args.covariates})') 93 | covariates_df = pd.read_csv(args.covariates, sep='\t', index_col=0).T 94 | assert phenotype_df.columns.equals(covariates_df.index) 95 | else: 96 | covariates_df = None 97 | 98 | if args.paired_covariate is not None: 99 | assert covariates_df is not None, f"Covariates matrix must be provided when using paired covariate" 100 | paired_covariate_df = pd.read_csv(args.paired_covariate, sep='\t', index_col=0) # phenotypes x samples 101 | assert paired_covariate_df.index.isin(phenotype_df.index).all(), f"Paired covariate phenotypes must be present in phenotype matrix." 102 | assert paired_covariate_df.columns.equals(phenotype_df.columns), f"Paired covariate samples must match samples in phenotype matrix." 
103 | else: 104 | paired_covariate_df = None 105 | 106 | if args.interaction is not None: 107 | logger.write(f' * reading interaction term(s) ({args.interaction})') 108 | # allow headerless input for single interactions 109 | with open(args.interaction) as f: 110 | f.readline() 111 | s = f.readline().strip() 112 | if len(s.split('\t')) == 2: # index + value 113 | interaction_df = pd.read_csv(args.interaction, sep='\t', index_col=0, header=None) 114 | else: 115 | interaction_df = pd.read_csv(args.interaction, sep='\t', index_col=0) 116 | # select samples 117 | assert covariates_df.index.isin(interaction_df.index).all() 118 | interaction_df = interaction_df.loc[covariates_df.index].astype(np.float32) 119 | else: 120 | interaction_df = None 121 | 122 | if args.maf_threshold is None: 123 | if args.mode == 'trans': 124 | maf_threshold = 0.05 125 | else: 126 | maf_threshold = 0 127 | else: 128 | maf_threshold = args.maf_threshold 129 | 130 | if args.phenotype_groups is not None: 131 | group_s = pd.read_csv(args.phenotype_groups, sep='\t', index_col=0, header=None).squeeze('columns').rename(None) 132 | # verify sort order 133 | group_dict = group_s.to_dict() 134 | previous_group = '' 135 | parsed_groups = 0 136 | for i in phenotype_df.index: 137 | if group_dict[i] != previous_group: 138 | parsed_groups += 1 139 | previous_group = group_dict[i] 140 | if not parsed_groups == len(group_s.unique()): 141 | raise ValueError('Groups defined in input do not match phenotype file (check sort order).') 142 | else: 143 | group_s = None 144 | 145 | # load genotypes 146 | if args.chunk_size is None: # load all genotypes into memory 147 | logger.write(f' * loading genotype dosages' if args.dosages else f' * loading genotypes') 148 | genotype_df, variant_df = genotypeio.load_genotypes(args.genotype_path, select_samples=phenotype_df.columns, dosages=args.dosages) 149 | if variant_df is None: 150 | assert not args.mode.startswith('cis'), f"Genotype data without variant positions is only supported for mode='trans'." 
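    # Example invocation with chunked genotype loading (paths are placeholders);
    # chunked mode requires pgen/psam/pvar input, handled by the else-branch below:
    #   python3 -m tensorqtl data/GEUVADIS.chr18 data/expression.bed.gz out_prefix \
    #       --covariates data/covariates.txt --mode cis --chunk_size 50000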
151 | else: 152 | if not all([os.path.exists(f"{args.genotype_path}.{ext}") for ext in ['pgen', 'psam', 'pvar']]): 153 | raise ValueError("Processing in chunks requires PLINK 2 pgen/psam/pvar files.") 154 | import pgen 155 | pgr = pgen.PgenReader(args.genotype_path, select_samples=phenotype_df.columns) 156 | 157 | if args.mode == 'cis': 158 | if args.chunk_size is None: 159 | res_df = cis.map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df=covariates_df, 160 | group_s=group_s, paired_covariate_df=paired_covariate_df, nperm=args.permutations, 161 | window=args.window, beta_approx=not args.disable_beta_approx, maf_threshold=maf_threshold, 162 | warn_monomorphic=args.warn_monomorphic, logger=logger, seed=args.seed, verbose=True) 163 | else: 164 | res_df = [] 165 | for gt_df, var_df, p_df, p_pos_df, _ in genotypeio.generate_paired_chunks(pgr, phenotype_df, phenotype_pos_df, args.chunk_size, 166 | dosages=args.dosages, verbose=True): 167 | res_df.append(cis.map_cis(gt_df, var_df, p_df, p_pos_df, covariates_df=covariates_df, 168 | group_s=group_s, paired_covariate_df=paired_covariate_df, nperm=args.permutations, 169 | window=args.window, beta_approx=not args.disable_beta_approx, maf_threshold=maf_threshold, 170 | warn_monomorphic=args.warn_monomorphic, logger=logger, seed=args.seed, verbose=True)) 171 | res_df = pd.concat(res_df) 172 | logger.write(' * writing output') 173 | if has_rpy2: 174 | calculate_qvalues(res_df, fdr=args.fdr, qvalue_lambda=args.qvalue_lambda, logger=logger) 175 | out_file = os.path.join(args.output_dir, f'{args.prefix}.cis_qtl.txt.gz') 176 | res_df.to_csv(out_file, sep='\t', float_format='%.6g') 177 | 178 | elif args.mode == 'cis_nominal': 179 | if args.chunk_size is None: 180 | cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, args.prefix, covariates_df=covariates_df, 181 | paired_covariate_df=paired_covariate_df, interaction_df=interaction_df, 182 | maf_threshold_interaction=args.maf_threshold_interaction, 183 | group_s=None, window=args.window, maf_threshold=maf_threshold, run_eigenmt=True, 184 | output_dir=args.output_dir, write_top=True, write_stats=not args.best_only, logger=logger, verbose=True) 185 | # compute significant pairs 186 | if args.cis_output is not None: 187 | cis_df = pd.read_csv(args.cis_output, sep='\t', index_col=0) 188 | nominal_prefix = os.path.join(args.output_dir, f'{args.prefix}.cis_qtl_pairs') 189 | signif_df = get_significant_pairs(cis_df, nominal_prefix, group_s=group_s, fdr=args.fdr) 190 | signif_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.cis_qtl.signif_pairs.parquet')) 191 | 192 | else: 193 | chunks = [] 194 | for gt_df, var_df, p_df, p_pos_df, ci in genotypeio.generate_paired_chunks(pgr, phenotype_df, phenotype_pos_df, args.chunk_size, 195 | dosages=args.dosages, verbose=True): 196 | prefix = f"{args.prefix}.chunk{ci+1}" 197 | chunks.append(prefix) 198 | cis.map_nominal(gt_df, var_df, p_df, p_pos_df, prefix, covariates_df=covariates_df, 199 | paired_covariate_df=paired_covariate_df, interaction_df=interaction_df, 200 | maf_threshold_interaction=args.maf_threshold_interaction, 201 | group_s=None, window=args.window, maf_threshold=maf_threshold, run_eigenmt=True, 202 | output_dir=args.output_dir, write_top=True, write_stats=not args.best_only, logger=logger, verbose=True) 203 | chunk_files = glob.glob(os.path.join(args.output_dir, f"{args.prefix}.chunk*.cis_qtl_pairs.*.parquet")) 204 | if args.chunk_size == 'chr': # remove redundant chunk ID from file names 205 | for f in 
chunk_files: 206 | x = re.findall(f"{args.prefix}\.(chunk\d+)", os.path.basename(f)) 207 | assert len(x) == 1 208 | os.rename(f, f.replace(f"{x[0]}.", "")) 209 | else: # concatenate outputs by chromosome 210 | chunk_df = pd.DataFrame({ 211 | 'file': chunk_files, 212 | 'chunk': [int(re.findall(f"{args.prefix}\.chunk(\d+)", os.path.basename(i))[0]) for i in chunk_files], 213 | 'chr': [re.findall("\.cis_qtl_pairs\.(.*)\.parquet", os.path.basename(i))[0] for i in chunk_files], 214 | }).sort_values('chunk') 215 | for chrom, chr_df in chunk_df.groupby('chr', sort=False): 216 | print(f"\rConcatenating outputs for {chrom}", end='' if chrom != chunk_df['chr'].iloc[-1] else None) 217 | pd.concat([pd.read_parquet(f) for f in chr_df['file']]).reset_index(drop=True).to_parquet( 218 | os.path.join(args.output_dir, f"{args.prefix}.cis_qtl_pairs.{chrom}.parquet")) 219 | for f in chr_df['file']: 220 | os.remove(f) 221 | # concatenate interaction results 222 | if interaction_df is not None: 223 | chunk_files = [os.path.join(args.output_dir, f"{c}.cis_qtl_top_assoc.txt.gz") for c in chunks] 224 | pd.concat([pd.read_csv(f, sep='\t', index_col=0, dtype=str) for f in chunk_files]).to_csv( 225 | os.path.join(args.output_dir, f"{args.prefix}.cis_qtl_top_assoc.txt.gz"), sep='\t') 226 | for f in chunk_files: 227 | os.remove(f) 228 | 229 | elif args.mode == 'cis_independent': 230 | summary_df = pd.read_csv(args.cis_output, sep='\t', index_col=0) 231 | summary_df.rename(columns={'minor_allele_samples':'ma_samples', 'minor_allele_count':'ma_count'}, inplace=True) 232 | if args.chunk_size is None: 233 | res_df = cis.map_independent(genotype_df, variant_df, summary_df, phenotype_df, phenotype_pos_df, covariates_df=covariates_df, 234 | group_s=group_s, fdr=args.fdr, nperm=args.permutations, window=args.window, 235 | maf_threshold=maf_threshold, logger=logger, seed=args.seed, verbose=True) 236 | else: 237 | res_df = [] 238 | for gt_df, var_df, p_df, p_pos_df, _ in genotypeio.generate_paired_chunks(pgr, phenotype_df, phenotype_pos_df, args.chunk_size, 239 | dosages=args.dosages, verbose=True): 240 | res_df.append(cis.map_independent(gt_df, var_df, summary_df, p_df, p_pos_df, covariates_df=covariates_df, 241 | group_s=group_s, fdr=args.fdr, nperm=args.permutations, window=args.window, 242 | maf_threshold=maf_threshold, logger=logger, seed=args.seed, verbose=True)) 243 | res_df = pd.concat(res_df).reset_index(drop=True) 244 | logger.write(' * writing output') 245 | out_file = os.path.join(args.output_dir, f'{args.prefix}.cis_independent_qtl.txt.gz') 246 | res_df.to_csv(out_file, sep='\t', index=False, float_format='%.6g') 247 | 248 | elif args.mode == 'cis_susie': 249 | if args.cis_output.endswith('.parquet'): 250 | signif_df = pd.read_parquet(args.cis_output) 251 | else: 252 | signif_df = pd.read_csv(args.cis_output, sep='\t') 253 | if 'qval' in signif_df: # otherwise input is from get_significant_pairs 254 | signif_df = signif_df[signif_df['qval'] <= args.fdr] 255 | phenotype_ids = phenotype_df.index[phenotype_df.index.isin(signif_df['phenotype_id'].unique())] 256 | phenotype_df = phenotype_df.loc[phenotype_ids] 257 | phenotype_pos_df = phenotype_pos_df.loc[phenotype_ids] 258 | if args.chunk_size is None: 259 | summary_df, res = susie.map(genotype_df, variant_df, phenotype_df, phenotype_pos_df, 260 | covariates_df, paired_covariate_df=paired_covariate_df, L=args.max_effects, 261 | maf_threshold=maf_threshold, max_iter=500, window=args.window, summary_only=False) 262 | else: 263 | summary_df = [] 264 | res = {} 265 | for 
gt_df, var_df, p_df, p_pos_df, _ in genotypeio.generate_paired_chunks(pgr, phenotype_df, phenotype_pos_df, args.chunk_size, 266 | dosages=args.dosages, verbose=True): 267 | chunk_summary_df, chunk_res = susie.map(gt_df, var_df, p_df, p_pos_df, 268 | covariates_df, paired_covariate_df=paired_covariate_df, L=args.max_effects, 269 | maf_threshold=maf_threshold, max_iter=500, window=args.window, summary_only=False) 270 | summary_df.append(chunk_summary_df) 271 | res |= chunk_res 272 | summary_df = pd.concat(summary_df).reset_index(drop=True) 273 | 274 | summary_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.SuSiE_summary.parquet')) 275 | with open(os.path.join(args.output_dir, f'{args.prefix}.SuSiE.pickle'), 'wb') as f: 276 | pickle.dump(res, f) 277 | 278 | elif args.mode == 'trans_susie': 279 | assert args.susie_loci is not None 280 | if args.susie_loci.endswith('.parquet'): 281 | locus_df = pd.read_parquet(args.susie_loci) 282 | else: 283 | locus_df = pd.read_csv(args.susie_loci, sep='\t') 284 | locus_df.rename(columns={'position':'pos'}, inplace=True) 285 | if args.chunk_size is None: 286 | assert variant_df is not None 287 | summary_df, res = susie.map_loci(locus_df, genotype_df, variant_df, phenotype_df, covariates_df, 288 | maf_threshold=maf_threshold, max_iter=500, window=args.window) 289 | else: 290 | raise NotImplementedError() 291 | 292 | summary_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.SuSiE_summary.parquet')) 293 | with open(os.path.join(args.output_dir, f'{args.prefix}.SuSiE.pickle'), 'wb') as f: 294 | pickle.dump(res, f) 295 | 296 | elif args.mode == 'trans': 297 | return_sparse = not args.return_dense 298 | if return_sparse: 299 | logger.write(f' * p-value threshold: {args.pval_threshold:.2g}') 300 | 301 | if interaction_df is not None: 302 | if interaction_df.shape[1] > 1: 303 | raise NotImplementedError('trans-QTL mapping currently only supports a single interaction.') 304 | else: 305 | interaction_df = interaction_df.squeeze('columns') 306 | 307 | if args.chunk_size is None: 308 | pairs_df = trans.map_trans(genotype_df, phenotype_df, covariates_df=covariates_df, interaction_s=interaction_df, 309 | return_sparse=return_sparse, pval_threshold=args.pval_threshold, 310 | maf_threshold=maf_threshold, batch_size=args.batch_size, 311 | return_r2=args.return_r2, logger=logger) 312 | if args.return_dense: 313 | pval_df, b_df, b_se_df, af_s = pairs_df 314 | else: 315 | pairs_df = [] 316 | n, rem = np.divmod(pgr.num_variants, int(args.chunk_size)) 317 | bounds = [0] + n * [int(args.chunk_size)] 318 | if rem != 0: 319 | bounds.append(rem) 320 | bounds = np.cumsum(bounds) 321 | nchunks = len(bounds)-1 322 | for i in range(nchunks): 323 | print(f"Processing genotype chunk {i+1}/{nchunks}") 324 | if args.dosages: 325 | gt_df = pgr.read_dosages_range(bounds[i], bounds[i+1]-1, dtype=np.float32) 326 | else: 327 | gt_df = pgr.read_range(bounds[i], bounds[i+1]-1, impute_mean=False, dtype=np.int8) 328 | pairs_df.append(trans.map_trans(gt_df, phenotype_df, covariates_df=covariates_df, interaction_s=interaction_df, 329 | return_sparse=return_sparse, pval_threshold=args.pval_threshold, 330 | maf_threshold=maf_threshold, batch_size=args.batch_size, 331 | return_r2=args.return_r2, logger=logger)) 332 | pairs_df = pd.concat(pairs_df).reset_index(drop=True) 333 | variant_df = pgr.variant_df 334 | 335 | if return_sparse: 336 | if variant_df is not None and phenotype_pos_df is not None: 337 | logger.write(' * filtering out cis-QTLs (within +/-5Mb)') 338 | pairs_df = 
trans.filter_cis(pairs_df, phenotype_pos_df, variant_df, window=5000000) 339 | 340 | logger.write(' * writing sparse output') 341 | if not args.output_text: 342 | pairs_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.trans_qtl_pairs.parquet')) 343 | else: 344 | out_file = os.path.join(args.output_dir, f'{args.prefix}.trans_qtl_pairs.txt.gz') 345 | pairs_df.to_csv(out_file, sep='\t', index=False, float_format='%.6g') 346 | else: 347 | logger.write(' * writing dense output') 348 | pval_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.trans_qtl_pval.parquet')) 349 | b_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.trans_qtl_beta.parquet')) 350 | b_se_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.trans_qtl_beta_se.parquet')) 351 | af_s.to_frame().to_parquet(os.path.join(args.output_dir, f'{args.prefix}.trans_qtl_af.parquet')) 352 | 353 | logger.write(f'[{datetime.now().strftime("%b %d %H:%M:%S")}] Finished mapping') 354 | 355 | 356 | if __name__ == '__main__': 357 | main() 358 | -------------------------------------------------------------------------------- /tensorqtl/trans.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils import data 3 | import numpy as np 4 | import pandas as pd 5 | import scipy.stats as stats 6 | from collections import OrderedDict 7 | import sys 8 | import os 9 | import time 10 | 11 | sys.path.insert(1, os.path.dirname(__file__)) 12 | import genotypeio 13 | from core import * 14 | 15 | 16 | def _in_cis(chrom, pos, gene_id, pos_dict, window=1000000): 17 | """Test if a variant is within +/-window of a gene's TSS.""" 18 | if chrom == pos_dict[gene_id]['chr']: 19 | gene_dict = pos_dict[gene_id] 20 | if 'pos' in gene_dict: 21 | start = gene_dict['pos'] 22 | end = start 23 | else: 24 | start = gene_dict['start'] 25 | end = gene_dict['end'] 26 | if pos >= start - window and pos <= end + window: 27 | return True 28 | else: 29 | return False 30 | else: 31 | return False 32 | 33 | 34 | def filter_cis(pairs_df, phenotype_pos_df, variant_df, window=5000000): 35 | """Filter out cis-QTLs 36 | 37 | Args: 38 | pairs_df: sparse output from map_trans() 39 | pos_dict: phenotype_id -> pos 40 | window: filter variants within +/-window of feature position (e.g., TSS for genes) 41 | """ 42 | pos_dict = phenotype_pos_df.T.to_dict() 43 | variant_df = variant_df.loc[pairs_df['variant_id'].unique()].copy() 44 | variant_dict = {v:{'chrom':c, 'pos':p} for v,c,p in zip(variant_df.index, variant_df['chrom'], variant_df['pos'])} 45 | 46 | drop_ix = [] 47 | for k,gene_id,variant_id in zip(pairs_df['phenotype_id'].index, pairs_df['phenotype_id'], pairs_df['variant_id']): 48 | if _in_cis(variant_dict[variant_id]['chrom'], variant_dict[variant_id]['pos'], gene_id, pos_dict, window=window): 49 | drop_ix.append(k) 50 | return pairs_df.drop(drop_ix) 51 | 52 | 53 | def map_trans(genotype_df, phenotype_df, covariates_df=None, interaction_s=None, 54 | return_sparse=True, pval_threshold=1e-5, maf_threshold=0.05, 55 | alleles=2, return_r2=False, batch_size=20000, 56 | logp=False, logger=None, verbose=True): 57 | """Run trans-QTL mapping 58 | 59 | Outputs (return_sparse == True): 60 | pval_df: DataFrame with columns variant_id, phenotype_id, pval, b, b_se, af 61 | Outputs (return_sparse == False): 62 | pval_df 63 | b_df 64 | b_se_df 65 | af_s 66 | """ 67 | 68 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 69 | 70 | if logger is None: 71 | logger = 
53 | def map_trans(genotype_df, phenotype_df, covariates_df=None, interaction_s=None,
54 |               return_sparse=True, pval_threshold=1e-5, maf_threshold=0.05,
55 |               alleles=2, return_r2=False, batch_size=20000,
56 |               logp=False, logger=None, verbose=True):
57 |     """Run trans-QTL mapping
58 | 
59 |     Outputs (return_sparse == True):
60 |       pval_df: DataFrame with columns variant_id, phenotype_id, pval, b, b_se, af (and r2 if return_r2 == True)
61 |     Outputs (return_sparse == False):
62 |       pval_df: p-values (variants x phenotypes)
63 |       b_df: slopes
64 |       b_se_df: slope standard errors
65 |       af_s: allele frequencies
66 |     """
67 | 
68 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
69 | 
70 |     if logger is None:
71 |         logger = SimpleLogger(verbose=verbose)
72 | 
73 |     variant_ids = genotype_df.index.tolist()
74 |     variant_dict = {i:j for i,j in enumerate(variant_ids)}
75 |     n_variants = len(variant_ids)
76 |     n_samples = phenotype_df.shape[1]
77 |     dof = n_samples - 2
78 | 
79 |     logger.write('trans-QTL mapping')
80 |     logger.write(f' * {n_samples} samples')
81 |     logger.write(f' * {phenotype_df.shape[0]} phenotypes')
82 |     if covariates_df is not None:
83 |         assert np.all(phenotype_df.columns==covariates_df.index)
84 |         logger.write(f' * {covariates_df.shape[1]} covariates')
85 |         residualizer = Residualizer(torch.tensor(covariates_df.values, dtype=torch.float32).to(device))
86 |         dof -= covariates_df.shape[1]
87 |     else:
88 |         residualizer = None
89 |     logger.write(f' * {n_variants} variants')
90 |     if interaction_s is not None:
91 |         logger.write(' * including interaction term')
92 | 
93 |     phenotypes_t = torch.tensor(phenotype_df.values, dtype=torch.float32).to(device)
94 |     genotype_ix = np.array([genotype_df.columns.tolist().index(i) for i in phenotype_df.columns])
95 |     genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
96 | 
97 |     # calculate correlation threshold for sparse output
98 |     if return_sparse:
99 |         tstat_threshold = -stats.t.ppf(pval_threshold/2, dof)
100 |         r_threshold = tstat_threshold / np.sqrt(dof + tstat_threshold**2)
101 |     else:
102 |         tstat_threshold = None
103 |         r_threshold = None
104 | 
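    # How r_threshold follows from pval_threshold (a sketch with assumed values):
    # inverting t = r*sqrt(dof/(1 - r^2)) gives r = t/sqrt(dof + t^2), so for a
    # hypothetical dof = 500 and pval_threshold = 1e-5:
    #
    #   tstat_threshold = -stats.t.ppf(1e-5/2, 500)                        # ~4.46 (two-sided)
    #   r_threshold = tstat_threshold / np.sqrt(500 + tstat_threshold**2)  # ~0.196
    #
    # i.e., only pairs with |r| >= ~0.196 are retained in sparse mode.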
105 |     if interaction_s is None:
106 |         ggt = genotypeio.GenotypeGeneratorTrans(genotype_df, batch_size=batch_size)
107 |         start_time = time.time()
108 |         res = []
109 |         n_variants = 0  # recount of variants passing the MAF filter
110 |         for k, (genotypes, variant_ids) in enumerate(ggt.generate_data(verbose=verbose), 1):
111 |             # copy genotypes to GPU
112 |             genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
113 | 
114 |             # filter by MAF
115 |             genotypes_t = genotypes_t[:,genotype_ix_t]
116 |             impute_mean(genotypes_t)
117 |             genotypes_t, variant_ids, af_t = filter_maf(genotypes_t, variant_ids, maf_threshold)
118 |             n_variants += genotypes_t.shape[0]
119 | 
120 |             r_t, genotype_var_t, phenotype_var_t = calculate_corr(genotypes_t, phenotypes_t, residualizer=residualizer, return_var=True)
121 |             del genotypes_t
122 | 
123 |             if return_sparse:
124 |                 m = r_t.abs() >= r_threshold
125 |                 ix_t = m.nonzero(as_tuple=False)  # sparse index
126 |                 ix = ix_t.cpu().numpy()
127 | 
128 |                 r_t = r_t.masked_select(m).type(torch.float64)
129 |                 r2_t = r_t.pow(2)
130 |                 tstat_t = r_t * torch.sqrt(dof / (1 - r2_t))
131 |                 std_ratio_t = torch.sqrt(phenotype_var_t[ix_t[:,1]] / genotype_var_t[ix_t[:,0]])
132 |                 b_t = r_t * std_ratio_t
133 |                 b_se_t = (b_t / tstat_t).type(torch.float32)
134 | 
135 |                 res.append(np.c_[
136 |                     variant_ids[ix[:,0]], phenotype_df.index[ix[:,1]],
137 |                     tstat_t.cpu(), b_t.cpu(), b_se_t.cpu(),
138 |                     r2_t.float().cpu(), af_t[ix_t[:,0]].cpu()
139 |                 ])
140 |             else:  # dense output: pval, b, b_se, af
141 |                 r_t = r_t.type(torch.float64)
142 |                 tstat_t = r_t * torch.sqrt(dof / (1 - r_t.pow(2)))
143 |                 std_ratio_t = torch.sqrt(phenotype_var_t / genotype_var_t.reshape(-1,1))
144 |                 b_t = (r_t * std_ratio_t).type(torch.float32)
145 |                 b_se_t = (b_t / tstat_t).type(torch.float32)
146 |                 res.append([variant_ids, tstat_t.cpu(), b_t.cpu(), b_se_t.cpu(), af_t.cpu()])
147 | 
148 |         logger.write(f' elapsed time: {(time.time()-start_time)/60:.2f} min')
149 |         del phenotypes_t
150 |         del residualizer
151 | 
152 |         if maf_threshold > 0:
153 |             logger.write(f' * {n_variants} variants passed MAF >= {maf_threshold} filtering')
154 | 
155 |         # post-processing: concatenate batches
156 |         if return_sparse:
157 |             res = np.concatenate(res)
158 |             res[:,2] = get_t_pval(res[:,2].astype(np.float64), dof, log=logp)  # convert t-statistics to p-values
159 |             pval_df = pd.DataFrame(res, columns=['variant_id', 'phenotype_id', 'pval', 'b', 'b_se', 'r2', 'af'])
160 |             pval_df['pval'] = pval_df['pval'].astype(np.float64)
161 |             pval_df['b'] = pval_df['b'].astype(np.float32)
162 |             pval_df['b_se'] = pval_df['b_se'].astype(np.float32)
163 |             pval_df['r2'] = pval_df['r2'].astype(np.float32)
164 |             pval_df['af'] = pval_df['af'].astype(np.float32)
165 |             if not return_r2:
166 |                 pval_df.drop('r2', axis=1, inplace=True)
167 |             logger.write('done.')
168 |             return pval_df
169 |         else:
170 |             variant_ids = pd.Series(np.concatenate([i[0] for i in res]), name='variant_id')
171 |             pval_df = pd.DataFrame(get_t_pval(np.concatenate([i[1] for i in res]).astype(np.float64), dof, log=logp),
172 |                                    index=variant_ids, columns=phenotype_df.index)
173 |             b_df = pd.DataFrame(np.concatenate([i[2] for i in res]),
174 |                                 index=variant_ids, columns=phenotype_df.index)
175 |             b_se_df = pd.DataFrame(np.concatenate([i[3] for i in res]),
176 |                                    index=variant_ids, columns=phenotype_df.index)
177 |             af_s = pd.Series(np.concatenate([i[4] for i in res]),
178 |                              index=variant_ids, name='af')
179 |             logger.write('done.')
180 |             return pval_df, b_df, b_se_df, af_s
181 | 
182 | 
183 |     else:  # interaction model
184 |         dof = n_samples - 4 - covariates_df.shape[1]
185 |         interaction_t = torch.tensor(interaction_s.values.reshape(1,-1), dtype=torch.float32).to(device)  # 1 x n_samples
186 |         mask_s = pd.Series(True, index=interaction_s.index)
187 |         mask_s[interaction_s.sort_values(kind='mergesort').index[:interaction_s.shape[0]//2]] = False
188 |         interaction_mask_t = torch.BoolTensor(mask_s.values).to(device)
189 | 
190 |         ggt = genotypeio.GenotypeGeneratorTrans(genotype_df, batch_size=batch_size)
191 |         start_time = time.time()
192 |         if return_sparse:
193 | 
194 |             nps = phenotypes_t.shape[0]
195 |             i0_t = interaction_t - interaction_t.mean()
196 |             p0_t = phenotypes_t - phenotypes_t.mean(1, keepdim=True)
197 |             p0_t = residualizer.transform(p0_t, center=False)
198 |             i0_t = residualizer.transform(i0_t, center=False)
199 | 
200 |             tstat_g_list = []
201 |             tstat_i_list = []
202 |             tstat_gi_list = []
203 |             af_list = []
204 |             ix0 = []
205 |             ix1 = []
206 |             for k, (genotypes, variant_ids) in enumerate(ggt.generate_data(verbose=verbose), 1):
207 |                 genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
208 |                 genotypes_t, mask_t = filter_maf_interaction(genotypes_t[:, genotype_ix_t],
209 |                                                              interaction_mask_t=interaction_mask_t,
210 |                                                              maf_threshold_interaction=maf_threshold)
211 |                 if genotypes_t.shape[0] > 0:
212 |                     ng, ns = genotypes_t.shape
213 | 
214 |                     # calculate allele frequency
215 |                     af_t = genotypes_t.sum(1) / (2*ns)
216 | 
217 |                     # centered inputs
218 |                     g0_t = genotypes_t - genotypes_t.mean(1, keepdim=True)
219 |                     gi_t = genotypes_t * interaction_t
220 |                     gi0_t = gi_t - gi_t.mean(1, keepdim=True)
221 |                     # residualize rows
222 |                     g0_t = residualizer.transform(g0_t, center=False)
223 |                     gi0_t = residualizer.transform(gi0_t, center=False)
224 | 
225 |                     # regression
226 |                     X_t = torch.stack([g0_t, i0_t.repeat(ng, 1), gi0_t], 2)  # ng x ns x 3
227 |                     Xinv = torch.matmul(torch.transpose(X_t, 1, 2), X_t).inverse()  # ng x 3 x 3
228 |                     b_t = torch.matmul(torch.matmul(Xinv, torch.transpose(X_t, 1, 2)), p0_t.t())  # ng x 3 x np
229 |                     dof = residualizer.dof - 2
230 | 
231 |                     rss_t = (torch.matmul(X_t, b_t) - p0_t.t()).pow(2).sum(1)  # ng x np
232 |                     b_se_t = torch.sqrt(Xinv[:, torch.eye(3, dtype=torch.uint8).bool()].unsqueeze(-1).repeat([1,1,nps]) * rss_t.unsqueeze(1).repeat([1,3,1]) / dof)
233 |                     tstat_t = (b_t.double() / b_se_t.double()).float()  # (ng x 3 x np)
234 |                     tstat_g_t = tstat_t[:,0,:]  # genotypes x phenotypes
235 |                     tstat_i_t = tstat_t[:,1,:]
236 |                     tstat_gi_t = tstat_t[:,2,:]
237 |                     m = tstat_gi_t.abs() >= tstat_threshold
238 |                     tstat_g_t = tstat_g_t[m]
239 |                     tstat_i_t = tstat_i_t[m]
240 |                     tstat_gi_t = tstat_gi_t[m]
241 |                     ix = m.nonzero(as_tuple=False)  # indexes: [genotype, phenotype]
242 |                     af_t = af_t[ix[:,0]]
243 | 
244 |                     res = [tstat_g_t, tstat_i_t, tstat_gi_t, af_t, ix]
245 |                     tstat_g, tstat_i, tstat_gi, af, ix = [i.cpu().numpy() for i in res]
246 |                     mask = mask_t.cpu().numpy()
247 |                     # convert sparse indexes
248 |                     if len(ix) > 0:
249 |                         variant_ids = variant_ids[mask.astype(bool)]
250 |                         tstat_g_list.append(tstat_g)
251 |                         tstat_i_list.append(tstat_i)
252 |                         tstat_gi_list.append(tstat_gi)
253 |                         af_list.append(af)
254 |                         ix0.extend(variant_ids[ix[:,0]].tolist())
255 |                         ix1.extend(phenotype_df.index[ix[:,1]].tolist())
256 | 
257 |             logger.write(f' time elapsed: {(time.time()-start_time)/60:.2f} min')
258 | 
259 |             # concatenate
260 |             pval_g = get_t_pval(np.concatenate(tstat_g_list), dof, log=logp)
261 |             pval_i = get_t_pval(np.concatenate(tstat_i_list), dof, log=logp)
262 |             pval_gi = get_t_pval(np.concatenate(tstat_gi_list), dof, log=logp)
263 |             af = np.concatenate(af_list)
264 | 
265 |             pval_df = pd.DataFrame(np.c_[ix0, ix1, pval_g, pval_i, pval_gi, af],
266 |                                    columns=['variant_id', 'phenotype_id', 'pval_g', 'pval_i', 'pval_gi', 'af']
267 |                                    ).astype({'pval_g':np.float64, 'pval_i':np.float64, 'pval_gi':np.float64, 'af':np.float32})
268 |             return pval_df
269 |         else:  # dense output
270 |             output_list = []
271 |             for k, (genotypes, variant_ids) in enumerate(ggt.generate_data(verbose=verbose), 1):
272 |                 genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
273 |                 genotypes_t, mask_t = filter_maf_interaction(genotypes_t[:, genotype_ix_t],
274 |                                                              interaction_mask_t=interaction_mask_t,
275 |                                                              maf_threshold_interaction=maf_threshold)
276 |                 res = calculate_interaction_nominal(genotypes_t, phenotypes_t, interaction_t.t(), residualizer,
277 |                                                     return_sparse=return_sparse)
278 |                 # res: tstat, b, b_se, af, ma_samples, ma_count
279 |                 res = [i.cpu().numpy() for i in res]
280 |                 mask = mask_t.cpu().numpy()
281 |                 variant_ids = variant_ids[mask.astype(bool)]
282 |                 output_list.append(res + [variant_ids])
283 |             logger.write(f' time elapsed: {(time.time()-start_time)/60:.2f} min')
284 | 
285 |             # concatenate outputs
286 |             tstat = np.concatenate([i[0] for i in output_list])
287 |             pval = get_t_pval(tstat, dof, log=logp)
288 |             b = np.concatenate([i[1] for i in output_list])
289 |             b_se = np.concatenate([i[2] for i in output_list])
290 |             af = np.concatenate([i[3] for i in output_list])
291 |             ma_samples = np.concatenate([i[4] for i in output_list])
292 |             ma_count = np.concatenate([i[5] for i in output_list])
293 |             variant_ids = np.concatenate([i[6] for i in output_list])
294 | 
295 |             pval_g_df = pd.DataFrame(pval[:,0,:], index=variant_ids, columns=phenotype_df.index)
296 |             pval_i_df = pd.DataFrame(pval[:,1,:], index=variant_ids, columns=phenotype_df.index)
297 |             pval_gi_df = pd.DataFrame(pval[:,2,:], index=variant_ids, columns=phenotype_df.index)
298 |             af_s = pd.Series(af, index=variant_ids, name='af').astype(np.float32)
299 |             ma_samples_s = pd.Series(ma_samples, index=variant_ids, name='ma_samples').astype(np.int32)
300 |             ma_count_s = pd.Series(ma_count, index=variant_ids, name='ma_count').astype(np.int32)
301 |             return pval_g_df, pval_i_df, pval_gi_df, af_s, ma_samples_s, ma_count_s
302 | 
303 | 
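# The interaction branch above solves, for each variant, the regression
# p ~ b_g*g + b_i*i + b_gi*(g*i) in closed form via the normal equations, batched
# over variants. A single-variant NumPy sketch (illustrative only; g, i, and p
# stand for already centered and residualized vectors, and dof for the residual
# degrees of freedom; in the code above, the g*i column is likewise centered and
# residualized before the fit):
#
#   X = np.stack([g, i, g*i], axis=1)          # ns x 3 design matrix
#   Xinv = np.linalg.inv(X.T @ X)              # 3 x 3
#   b = Xinv @ X.T @ p                         # [b_g, b_i, b_gi]
#   rss = ((X @ b - p)**2).sum()               # residual sum of squares
#   b_se = np.sqrt(np.diag(Xinv) * rss / dof)  # standard errors
#   tstat = b / b_se                           # -> pval_g, pval_i, pval_gi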
304 | def map_permutations(genotype_df, covariates_df, permutations=None,
305 |                      chr_s=None, nperms=10000, maf_threshold=0.05,
306 |                      batch_size=20000, logger=None, seed=None, verbose=True):
307 |     """Compute a null distribution of maximum genotype-phenotype correlations via permutations
308 | 
309 |     Permuted phenotypes are tested against all variants; the maximum r2 per permutation is recorded (per chromosome if chr_s is provided).
310 |     Warning: this function assumes that all phenotypes are normally distributed,
311 |     e.g., inverse normal transformed
312 |     """
313 | 
314 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
315 | 
316 |     if logger is None:
317 |         logger = SimpleLogger()
318 |     assert covariates_df.index.isin(genotype_df.columns).all()
319 |     sample_ids = covariates_df.index.values
320 | 
321 |     variant_ids = genotype_df.index.tolist()
322 | 
323 |     # index of VCF samples corresponding to phenotypes
324 |     genotype_ix = np.array([genotype_df.columns.tolist().index(i) for i in sample_ids])
325 |     genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
326 | 
327 |     n_variants = len(variant_ids)
328 |     n_samples = len(sample_ids)
329 |     dof = n_samples - 2 - covariates_df.shape[1]
330 | 
331 |     logger.write('trans-QTL mapping (permutations)')
332 |     logger.write(f' * {n_samples} samples')
333 |     logger.write(f' * {covariates_df.shape[1]} covariates')
334 |     logger.write(f' * {n_variants} variants')
335 | 
336 |     if permutations is None:  # generate permutations assuming normal distribution
337 |         q = stats.norm.ppf(np.arange(1,n_samples+1)/(n_samples+1))
338 |         permutations = np.tile(q,[nperms,1])
339 |         if seed is not None:
340 |             np.random.seed(seed)
341 |         for i in np.arange(nperms):
342 |             np.random.shuffle(permutations[i,:])
343 |     else:
344 |         assert permutations.shape[1]==n_samples
345 |         nperms = permutations.shape[0]
346 |     logger.write(f' * {nperms} permutations')
347 | 
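    # A sketch of the permutation scheme above, with assumed sizes n_samples = 5
    # and nperms = 2:
    #
    #   q = stats.norm.ppf(np.arange(1, 6)/6)     # [-0.97, -0.43, 0.0, 0.43, 0.97]
    #   permutations = np.tile(q, [2, 1])         # 2 x 5, one row per permutation
    #   for i in np.arange(2):
    #       np.random.shuffle(permutations[i,:])  # shuffle each row in place
    #
    # Each row is a synthetic inverse-normal phenotype, which is why the docstring
    # warns that real phenotypes should be inverse normal transformed.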
348 |     permutations_t = torch.tensor(permutations, dtype=torch.float32).to(device)
349 |     residualizer = Residualizer(torch.tensor(covariates_df.values, dtype=torch.float32).to(device))
350 | 
351 |     if chr_s is not None:
352 |         assert chr_s.index.equals(genotype_df.index)
353 |         start_time = time.time()
354 |         n_variants = 0
355 |         ggt = genotypeio.GenotypeGeneratorTrans(genotype_df, batch_size=batch_size, chr_s=chr_s)
356 |         total_batches = np.sum([len(ggt.chr_batch_indexes[c]) for c in ggt.chroms])
357 | 
358 |         chr_max_r2 = OrderedDict()
359 |         k = 0
360 |         for chrom in ggt.chroms:
361 |             max_r2_t = torch.FloatTensor(nperms).fill_(0).to(device)
362 |             for k, (genotypes, variant_ids) in enumerate(ggt.generate_data(chrom=chrom, verbose=verbose, enum_start=k+1), k+1):
363 |                 genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
364 |                 genotypes_t = genotypes_t[:, genotype_ix_t]
365 |                 impute_mean(genotypes_t)
366 |                 genotypes_t, _, _ = filter_maf(genotypes_t, variant_ids, maf_threshold)
367 |                 n_variants += genotypes_t.shape[0]
368 | 
369 |                 r2_t = calculate_corr(genotypes_t, permutations_t, residualizer=residualizer).pow(2)
370 |                 del genotypes_t
371 |                 m,_ = r2_t.max(0)
372 |                 max_r2_t = torch.max(m, max_r2_t)
373 |             chr_max_r2[chrom] = max_r2_t.cpu()
374 |         logger.write(f' time elapsed: {(time.time()-start_time)/60:.2f} min')
375 |         if maf_threshold > 0:
376 |             logger.write(f' * {n_variants} variants passed MAF >= {maf_threshold} filtering')
377 |         chr_max_r2 = pd.DataFrame(chr_max_r2)
378 | 
379 |         # leave-one-out max
380 |         max_r2 = OrderedDict()
381 |         for c in chr_max_r2:
382 |             max_r2[c] = chr_max_r2[np.setdiff1d(chr_max_r2.columns, c)].max(1)
383 |         max_r2 = pd.DataFrame(max_r2)  # nperms x chrs
384 | 
385 |         # empirical p-values
386 |         tstat = np.sqrt(dof*max_r2 / (1-max_r2))
387 |         minp_empirical = pd.DataFrame(2*stats.t.cdf(-np.abs(tstat), dof), columns=tstat.columns)  # nperms x chrs
388 | 
389 |         beta_shape1 = OrderedDict()
390 |         beta_shape2 = OrderedDict()
391 |         true_dof = OrderedDict()
392 |         minp_vec = OrderedDict()
393 |         for c in max_r2:
394 |             beta_shape1[c], beta_shape2[c], true_dof[c], minp_vec[c] = fit_beta_parameters(max_r2[c], dof, return_minp=True)
395 | 
396 |         beta_df = pd.DataFrame(OrderedDict([
397 |             ('beta_shape1', beta_shape1),
398 |             ('beta_shape2', beta_shape2),
399 |             ('true_df', true_dof),
400 |             ('minp_true_df', minp_vec),
401 |             ('minp_empirical', {c:minp_empirical[c].values for c in minp_empirical}),
402 |         ]))
403 |         return beta_df
404 | 
405 |     else:  # not split_chr
406 |         ggt = genotypeio.GenotypeGeneratorTrans(genotype_df, batch_size=batch_size)
407 |         start_time = time.time()
408 |         max_r2_t = torch.FloatTensor(nperms).fill_(0).to(device)
409 |         n_variants = 0
410 |         for k, (genotypes, variant_ids) in enumerate(ggt.generate_data(verbose=verbose), 1):
411 |             genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
412 |             genotypes_t = genotypes_t[:, genotype_ix_t]
413 |             impute_mean(genotypes_t)
414 |             genotypes_t, _, _ = filter_maf(genotypes_t, variant_ids, maf_threshold)
415 |             n_variants += genotypes_t.shape[0]
416 | 
417 |             r2_t = calculate_corr(genotypes_t, permutations_t, residualizer=residualizer).pow(2)
418 |             del genotypes_t
419 |             m,_ = r2_t.max(0)
420 |             max_r2_t = torch.max(m, max_r2_t)
421 |         logger.write(f' time elapsed: {(time.time()-start_time)/60:.2f} min')
422 |         if maf_threshold > 0:
423 |             logger.write(f' * {n_variants} variants passed MAF >= {maf_threshold} filtering')
424 |         max_r2 = max_r2_t.cpu().numpy().astype(np.float64)
425 |         tstat = np.sqrt(dof*max_r2 / (1-max_r2))
426 |         minp_empirical = 2*stats.t.cdf(-np.abs(tstat), dof)
427 |         beta_shape1, beta_shape2, true_dof, minp_vec = fit_beta_parameters(max_r2, dof, tol=1e-4, return_minp=True)
428 | 
429 |         beta_s = pd.Series([n_samples, dof, beta_shape1, beta_shape2, true_dof, minp_vec, minp_empirical],
430 |                            index=['num_samples', 'df', 'beta_shape1', 'beta_shape2', 'true_df', 'minp_true_df', 'minp_empirical'])
431 |         return beta_s
432 | 
433 | 
434 | def apply_permutations(res, pairs_df):
435 |     """Add permutation-based significance estimates to map_trans() output
436 |     res: output from map_permutations()
437 |     pairs_df: output from map_trans()
438 |     """
439 | 
440 |     if isinstance(res, pd.Series):  # chrs not split
441 |         nperms = len(res['minp_true_df'])
442 |         for k in ['beta_shape1', 'beta_shape2', 'true_df']:
443 |             pairs_df[k] = res[k]
444 |         pairs_df['pval_true_df'] = pval_from_corr(pairs_df['r2'], pairs_df['true_df'])
445 |         pairs_df['pval_perm'] = np.array([(np.sum(res['minp_empirical']<=p)+1)/(nperms+1) for p in pairs_df['pval']])
446 |         pairs_df['pval_beta'] = stats.beta.cdf(pairs_df['pval_true_df'], pairs_df['beta_shape1'], pairs_df['beta_shape2'])
447 | 
448 |     elif isinstance(res, pd.DataFrame):  # chrs split
449 |         nperms = len(res['minp_empirical'][0])
450 |         for k in ['beta_shape1', 'beta_shape2', 'true_df']:
451 |             pairs_df[k] = res.loc[pairs_df['phenotype_chr'], k].values
452 |         pairs_df['pval_true_df'] = pval_from_corr(pairs_df['r2'], pairs_df['true_df'])
453 |         pairs_df['pval_perm'] = [(np.sum(pe<=p)+1)/(nperms+1) for p,pe in zip(pairs_df['pval'], res.loc[pairs_df['phenotype_chr'], 'minp_empirical'])]
454 |         # pval_perm = np.array([(np.sum(minp_empirical[chrom]<=p)+1)/(nperms+1) for p, chrom in zip(pval_df['pval'], pval_df['phenotype_chr'])])
455 |         # pval_perm = np.array([(np.sum(minp_empirical<=p)+1)/(nperms+1) for p in minp_nominal])
456 |         pairs_df['pval_beta'] = stats.beta.cdf(pairs_df['pval_true_df'], pairs_df['beta_shape1'], pairs_df['beta_shape2'])
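# The empirical p-value computed in apply_permutations() is the standard
# permutation estimate (r + 1)/(nperms + 1), where r counts permutation minima at
# least as extreme as the observed p-value. A toy example with assumed values:
#
#   minp_empirical = np.array([1e-6, 2e-4, 0.3, 0.8])        # nperms = 4
#   p = 1e-3                                                 # nominal p-value
#   pval_perm = (np.sum(minp_empirical <= p) + 1) / (4 + 1)  # (2+1)/5 = 0.6
#
# pval_beta instead evaluates the CDF of the fitted Beta(beta_shape1, beta_shape2)
# distribution at the nominal p-value recomputed with the effective ('true') dof.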
457 | 
--------------------------------------------------------------------------------