├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── docs
│   └── outputs.md
├── example
│   ├── GTEx_v8_example.ipynb
│   ├── data
│   │   ├── GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18.pgen
│   │   ├── GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18.psam
│   │   ├── GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18.pvar
│   │   ├── GEUVADIS.445_samples.covariates.txt
│   │   └── GEUVADIS.445_samples.expression.bed.gz
│   └── tensorqtl_examples.ipynb
├── install
│   ├── INSTALL.md
│   ├── install_cuda.sh
│   └── tensorqtl_env.yml
├── pyproject.toml
└── tensorqtl
    ├── __init__.py
    ├── __main__.py
    ├── cis.py
    ├── coloc.py
    ├── core.py
    ├── eigenmt.py
    ├── genotypeio.py
    ├── mixqtl.py
    ├── pgen.py
    ├── post.py
    ├── rfunc.py
    ├── susie.py
    ├── tensorqtl.py
    └── trans.py
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | *.egg-info/
3 | *.ipynb_checkpoints/
4 | build/
5 | dist/
6 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Dockerfile for tensorQTL
2 | # https://gitlab.com/nvidia/container-images/cuda/blob/master/doc/unsupported-tags.md
3 | FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
4 | LABEL maintainer="Francois Aguet"
5 |
6 | RUN apt-get update && apt-get install -y software-properties-common && \
7 | apt-get update && apt-get install -y \
8 | apt-transport-https \
9 | build-essential \
10 | cmake \
11 | curl \
12 | libboost-all-dev \
13 | libbz2-dev \
14 | libcurl3-dev \
15 | liblzma-dev \
16 | libncurses5-dev \
17 | libssl-dev \
18 | python3 \
19 | python3-pip \
20 | sudo \
21 | unzip \
22 | wget \
23 | zlib1g-dev \
24 | && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
25 | apt-get clean && \
26 | apt-get autoremove -y && \
27 | rm -rf /var/lib/{apt,dpkg,cache,log}/
28 |
29 | # htslib
30 | RUN cd /opt && \
31 | wget --no-check-certificate https://github.com/samtools/htslib/releases/download/1.19/htslib-1.19.tar.bz2 && \
32 | tar -xf htslib-1.19.tar.bz2 && rm htslib-1.19.tar.bz2 && cd htslib-1.19 && \
33 | ./configure --enable-libcurl --enable-s3 --enable-plugins --enable-gcs && \
34 | make && make install && make clean
35 |
36 | # bcftools
37 | RUN cd /opt && \
38 | wget --no-check-certificate https://github.com/samtools/bcftools/releases/download/1.19/bcftools-1.19.tar.bz2 && \
39 | tar -xf bcftools-1.19.tar.bz2 && rm bcftools-1.19.tar.bz2 && cd bcftools-1.19 && \
40 | ./configure --with-htslib=system && make && make install && make clean
41 |
42 | # install R
43 | ENV DEBIAN_FRONTEND=noninteractive
44 | RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
45 | RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/'
46 | RUN apt update && apt install -y r-base r-base-dev
47 | ENV R_LIBS_USER=/opt/R/4.0
48 | RUN Rscript -e 'if (!requireNamespace("BiocManager", quietly = TRUE)) {install.packages("BiocManager")}; BiocManager::install("qvalue");'
49 |
50 | # python modules
51 | RUN pip3 install --upgrade pip setuptools
52 | RUN pip3 install numpy pandas scipy
53 | RUN pip3 install pandas-plink ipython jupyter matplotlib pyarrow torch rpy2 gcsfs "Pgenlib>=0.90.1"
54 | RUN pip3 install tensorqtl==1.0.9
55 |
56 | # RUN cd /opt && \
57 | # wget https://github.com/broadinstitute/tensorqtl/archive/v1.0.8.tar.gz && \
58 | # tar -xf v1.0.8.tar.gz && mv tensorqtl-1.0.8 tensorqtl && \
59 | # rm v1.0.8.tar.gz
60 | # RUN pip3 install /opt/tensorqtl/
61 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2018-2019, The Broad Institute, Inc. and The General Hospital Corporation.
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## tensorQTL
2 |
3 | tensorQTL is a GPU-enabled QTL mapper that performs *cis*- and *trans*-QTL mapping ~200-300 times faster than CPU-based implementations.
4 |
5 | If you use tensorQTL in your research, please cite the following paper:
6 | [Taylor-Weiner, Aguet, et al., *Genome Biol.*, 2019](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1836-7).
7 | Empirical beta-approximated p-values are computed as described in [Ongen et al., *Bioinformatics*, 2016](https://academic.oup.com/bioinformatics/article/32/10/1479/1742545).
8 |
9 | ### Install
10 | You can install tensorQTL using pip:
11 | ```
12 | pip3 install tensorqtl
13 | ```
14 | or directly from this repository:
15 | ```
16 | $ git clone git@github.com:broadinstitute/tensorqtl.git
17 | $ cd tensorqtl
18 | # install into a new virtual environment and activate it
19 | $ mamba env create -f install/tensorqtl_env.yml
20 | $ conda activate tensorqtl
21 | ```
22 | To install the latest version from this repository, run
23 | ```
24 | pip install tensorqtl@git+https://github.com/broadinstitute/tensorqtl.git
25 | ```
26 |
27 | To use PLINK 2 binary files ([pgen/pvar/psam](https://www.cog-genomics.org/plink/2.0/input#pgen)), [pgenlib](https://github.com/chrchang/plink-ng/tree/master/2.0/Python) must be installed using either
28 | ```
29 | pip install Pgenlib
30 | ```
31 | (this is included in `tensorqtl_env.yml` above), or from source:
32 | ```
33 | git clone git@github.com:chrchang/plink-ng.git
34 | cd plink-ng/2.0/Python/
35 | python3 setup.py build_ext
36 | python3 setup.py install
37 | ```
38 |
39 | ### Requirements
40 |
41 | tensorQTL requires an environment configured with a GPU for optimal performance, but can also be run on a CPU. Instructions for setting up a virtual machine on Google Cloud Platform are provided [here](install/INSTALL.md).
42 |
43 | ### Input formats
44 | Three inputs are required for QTL analyses with tensorQTL: genotypes, phenotypes, and covariates.
45 | * Phenotypes must be provided in BED format, with a single header line starting with `#`, the first four columns corresponding to `chr`, `start`, `end`, `phenotype_id`, and the remaining columns corresponding to samples (the identifiers must match those in the genotype input). In addition to .bed/.bed.gz, BED input in .parquet is also supported. The BED file can specify the center of the *cis*-window (usually the TSS), with `start == end-1`, or alternatively start and end positions, in which case the *cis*-window is [start-window, end+window]. A function for generating a BED template from a gene annotation in GTF format is available in [pyqtl](https://github.com/broadinstitute/pyqtl) (`io.gtf_to_tss_bed`). A minimal example of the BED layout is shown below.
46 | * Covariates can be provided as a tab-delimited text file (covariates x samples) or dataframe (samples x covariates), with row and column headers.
47 | * Genotypes should preferably be in [PLINK2](https://www.cog-genomics.org/plink/2.0/) pgen/pvar/psam format, which can be generated from a VCF as follows:
48 | ```
49 | plink2 \
50 | --output-chr chrM \
51 | --vcf ${plink_prefix_path}.vcf.gz \
52 | --out ${plink_prefix_path}
53 | ```
54 | If using `--make-bed` with PLINK 1.9 or earlier, add the `--keep-allele-order` flag.
55 |
56 | Alternatively, the genotypes can be provided in bed/bim/fam format, or as a parquet dataframe (genotypes x samples).
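
For reference, a minimal sketch of the phenotype BED layout (tab-delimited; the coordinates and values below are illustrative only, and the sample IDs must match the genotype input):
```
#chr   start   end     phenotype_id        HG00096  HG00097
chr18  109064  109065  ENSG00000263006.6   -0.58    1.02
```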
57 |
58 |
59 | The [examples notebook](example/tensorqtl_examples.ipynb) contains examples of all input files. The input formats for phenotypes and covariates are identical to those used by [FastQTL](https://github.com/francois-a/fastqtl).
60 |
61 | ### Examples
62 | For examples illustrating *cis*- and *trans*-QTL mapping, please see [tensorqtl_examples.ipynb](example/tensorqtl_examples.ipynb).
63 |
64 | ### Running tensorQTL
65 | This section describes how to run the different modes of tensorQTL, both from the command line and within Python.
66 | For a full list of options, run
67 | ```
68 | python3 -m tensorqtl --help
69 | ```
70 |
71 | #### Loading input files
72 | This section is only relevant when running tensorQTL in Python.
73 | The following imports are required:
74 | ```
75 | import pandas as pd
76 | import tensorqtl
77 | from tensorqtl import genotypeio, cis, trans
78 | ```
79 | Phenotypes and covariates can be loaded as follows:
80 | ```
81 | phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(phenotype_bed_file)
82 | covariates_df = pd.read_csv(covariates_file, sep='\t', index_col=0).T # samples x covariates
83 | ```
84 | Genotypes can be loaded as follows, where `plink_prefix_path` is the path to the genotype files in PLINK format (excluding the `.bed`/`.bim`/`.fam` extensions):
85 | ```
86 | pr = genotypeio.PlinkReader(plink_prefix_path)
87 | # load genotypes and variants into data frames
88 | genotype_df = pr.load_genotypes()
89 | variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]
90 | ```
91 | To save memory when using genotypes for a subset of samples, a subset of samples can be loaded (this is not strictly necessary, since tensorQTL will select the relevant samples from `genotype_df` otherwise):
92 | ```
93 | pr = genotypeio.PlinkReader(plink_prefix_path, select_samples=phenotype_df.columns)
94 | ```
95 |
96 | #### *cis*-QTL mapping: permutations
97 | This is the main mode for *cis*-QTL mapping. It generates phenotype-level summary statistics with empirical p-values, enabling calculation of genome-wide FDR.
98 | In Python:
99 | ```
100 | cis_df = cis.map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df)
101 | tensorqtl.calculate_qvalues(cis_df, qvalue_lambda=0.85)
102 | ```
103 | Shell command:
104 | ```
105 | python3 -m tensorqtl ${plink_prefix_path} ${expression_bed} ${prefix} \
106 | --covariates ${covariates_file} \
107 | --mode cis
108 | ```
109 | `${prefix}` specifies the output file name.
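
The permutation results are written to `${prefix}.cis_qtl.txt.gz` and can be loaded with pandas, e.g.:
```
import pandas as pd
cis_df = pd.read_csv(f"{prefix}.cis_qtl.txt.gz", sep="\t", index_col=0)
```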
110 |
111 | #### *cis*-QTL mapping: summary statistics for all variant-phenotype pairs
112 | In Python:
113 | ```
114 | cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df,
115 | prefix, covariates_df, output_dir='.')
116 | ```
117 | Shell command:
118 | ```
119 | python3 -m tensorqtl ${plink_prefix_path} ${expression_bed} ${prefix} \
120 | --covariates ${covariates_file} \
121 | --mode cis_nominal
122 | ```
123 | The results are written to a [parquet](https://parquet.apache.org/) file for each chromosome. These files can be read using `pandas`:
124 | ```
125 | df = pd.read_parquet(file_name)
126 | ```
127 | #### *cis*-QTL mapping: conditionally independent QTLs
128 | This mode maps conditionally independent *cis*-QTLs using the stepwise regression procedure described in [GTEx Consortium, 2017](https://www.nature.com/articles/nature24277). The output from the permutation step (see `map_cis` above) is required.
129 | In Python:
130 | ```
131 | indep_df = cis.map_independent(genotype_df, variant_df, cis_df,
132 | phenotype_df, phenotype_pos_df, covariates_df)
133 | ```
134 | Shell command:
135 | ```
136 | python3 -m tensorqtl ${plink_prefix_path} ${expression_bed} ${prefix} \
137 | --covariates ${covariates_file} \
138 | --cis_output ${prefix}.cis_qtl.txt.gz \
139 | --mode cis_independent
140 | ```
141 |
142 | #### *cis*-QTL mapping: interactions
143 | Instead of mapping the standard linear model (p ~ g), this mode includes an interaction term (p ~ g + i + gi) and returns full summary statistics for the model. The interaction term is a tab-delimited text file or dataframe mapping sample ID to interaction value(s) (if multiple interactions are used, the file must include a header with variable names). With the `run_eigenmt=True` option, [eigenMT](https://www.cell.com/ajhg/fulltext/S0002-9297(15)00492-9)-adjusted p-values are computed.
144 | In Python:
145 | ```
146 | cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, prefix,
147 | covariates_df=covariates_df,
148 | interaction_df=interaction_df, maf_threshold_interaction=0.05,
149 | run_eigenmt=True, output_dir='.', write_top=True, write_stats=True)
150 | ```
151 | The input options `write_top` and `write_stats` control whether the top association per phenotype and full summary statistics, respectively, are written to file.
152 |
153 | Shell command:
154 | ```
155 | python3 -m tensorqtl ${plink_prefix_path} ${expression_bed} ${prefix} \
156 | --covariates ${covariates_file} \
157 | --interaction ${interactions_file} \
158 | --best_only \
159 | --mode cis_nominal
160 | ```
161 | The option `--best_only` disables output of full summary statistics.
162 |
163 | Full summary statistics are saved as [parquet](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html) files for each chromosome, in `${output_dir}/${prefix}.cis_qtl_pairs.${chr}.parquet`, and the top association for each phenotype is saved to `${output_dir}/${prefix}.cis_qtl_top_assoc.txt.gz`. In these files, the columns `b_g`, `b_g_se`, `pval_g` are the effect size, standard error, and p-value of *g* in the model, with matching columns for *i* and *gi*. In the `*.cis_qtl_top_assoc.txt.gz` file, `tests_emt` is the effective number of independent variants in the cis-window estimated with eigenMT, i.e., based on the eigenvalue decomposition of the regularized genotype correlation matrix ([Davis et al., AJHG, 2016](https://www.cell.com/ajhg/fulltext/S0002-9297(15)00492-9)). The adjusted p-value is `pval_emt = pval_gi * tests_emt`, and `pval_adj_bh` is the Benjamini-Hochberg-adjusted p-value corresponding to `pval_emt`.
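
The adjustment can be illustrated as follows (a sketch, not the internal implementation; capping at 1 follows the standard Bonferroni convention):
```
import numpy as np
import pandas as pd

top_df = pd.read_csv(f"{prefix}.cis_qtl_top_assoc.txt.gz", sep="\t")
# eigenMT/Bonferroni: scale the interaction p-value by the effective number of tests
pval_emt = np.minimum(top_df["pval_gi"] * top_df["tests_emt"], 1)
```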
164 |
165 | #### *trans*-QTL mapping
166 | This mode computes nominal associations between all phenotypes and genotypes. tensorQTL generates sparse output by default (associations with p-value < 1e-5). *cis*-associations are filtered out. The output is in parquet format, with columns `variant_id`, `phenotype_id`, `pval`, `b`, `b_se`, `af` (see [docs/outputs.md](docs/outputs.md)).
167 | In Python:
168 | ```
169 | trans_df = trans.map_trans(genotype_df, phenotype_df, covariates_df,
170 | return_sparse=True, pval_threshold=1e-5, maf_threshold=0.05,
171 | batch_size=20000)
172 | # remove cis-associations
173 | trans_df = trans.filter_cis(trans_df, phenotype_pos_df, variant_df, window=5000000)
174 | ```
175 | Shell command:
176 | ```
177 | python3 -m tensorqtl ${plink_prefix_path} ${expression_bed} ${prefix} \
178 | --covariates ${covariates_file} \
179 | --mode trans
180 | ```
181 |
182 |
--------------------------------------------------------------------------------
/docs/outputs.md:
--------------------------------------------------------------------------------
1 | ### Output files
2 | #### Mode `cis_nominal`
3 | Column | Description
4 | --- | ---
5 | `phenotype_id` | Phenotype ID
6 | `variant_id` | Variant ID
7 | `start_distance` | Distance between the variant and phenotype start position (e.g., TSS)
8 | `end_distance` | Distance between the variant and phenotype end position (only present if different from start position)
9 | `af` | In-sample ALT allele frequency of the variant
10 | `ma_samples` | Number of samples carrying at least one minor allele
11 | `ma_count` | Number of minor alleles
12 | `pval_nominal` | Nominal p-value of the association between the phenotype and variant
13 | `slope` | Regression slope
14 | `slope_se` | Standard error of the regression slope
15 |
16 | #### Mode `cis_nominal`, with interaction term
17 | When an interaction term is included, the output contains the following columns in place of `pval_nominal`, `slope`, and `slope_se`:
18 | Column | Description
19 | --- | ---
20 | `pval_g` | Nominal p-value of the genotype term
21 | `b_g` | Slope of the genotype term
22 | `b_g_se` | Standard error of `b_g`
23 | `pval_i` | Nominal p-value of the interaction variable
24 | `b_i` | Slope of the interaction variable
25 | `b_i_se` | Standard error of `b_i`
26 | `pval_gi` | Nominal p-value of the interaction term
27 | `b_gi` | Slope of the interaction term
28 | `b_gi_se` | Standard error of `b_gi`
29 | `tests_emt` | Effective number of independent variants (Meff) estimated by eigenMT
30 | `pval_emt` | Bonferroni-adjusted `pval_gi` (i.e., multiplied by Meff)
31 | `pval_adj_bh` | Benjamini-Hochberg adjusted `pval_emt`
32 |
33 | #### Mode `cis`
34 | Column | Description
35 | --- | ---
36 | `phenotype_id` | Phenotype ID
37 | `num_var` | Number of variants in *cis*-window
38 | `beta_shape1` | Parameter of the fitted Beta distribution
39 | `beta_shape2` | Parameter of the fitted Beta distribution
40 | `true_df` | Degrees of freedom used to compute p-values
41 | `pval_true_df` | Nominal p-value based on `true_df`
42 | `variant_id` | Variant ID
43 | `start_distance` | Distance between the variant and phenotype start position (e.g., TSS)
44 | `end_distance` | Distance between the variant and phenotype end position (only present if different from start position)
45 | `ma_samples` | Number of samples carrying at least one minor allele
46 | `ma_count` | Number of minor alleles
47 | `af` | In-sample ALT allele frequency of the variant
48 | `pval_nominal` | Nominal p-value of the association between the phenotype and variant
49 | `slope` | Regression slope
50 | `slope_se` | Standard error of the regression slope
51 | `pval_perm` | Empirical p-value from permutations
52 | `pval_beta` | Beta-approximated empirical p-value
53 | `qval` | Storey q-value corresponding to `pval_beta`
54 | `pval_nominal_threshold` | Nominal p-value threshold for significant associations with the phenotype
55 |
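For example, significant variant-phenotype pairs can be extracted by comparing the nominal p-values from mode `cis_nominal` to these per-phenotype thresholds (a sketch; the file names are placeholders):
```
import pandas as pd

cis_df = pd.read_csv("prefix.cis_qtl.txt.gz", sep="\t", index_col=0)  # mode `cis` output
pairs_df = pd.read_parquet("prefix.cis_qtl_pairs.chr18.parquet")      # mode `cis_nominal` output
thresholds = pairs_df["phenotype_id"].map(cis_df["pval_nominal_threshold"])
signif_df = pairs_df[pairs_df["pval_nominal"] < thresholds]
```
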
56 | #### Mode `cis_independent`
57 | The columns are the same as for `cis`, excluding `qval` and `pval_nominal_threshold`, and adding:
58 | Column | Description
59 | --- | ---
60 | `rank` | Rank of the variant for the phenotype
61 |
62 | #### Mode `trans`
63 | Column | Description
64 | --- | ---
65 | `variant_id` | Variant ID
66 | `phenotype_id` | Phenotype ID
67 | `pval` | Nominal p-value of the association between the phenotype and variant
68 | `b` | Regression slope
69 | `b_se` | Standard error of the regression slope
70 | `r2` | Squared residual genotype-phenotype correlation (only generated if `map_trans(..., return_r2=True)`)
71 | `af` | In-sample ALT allele frequency of the variant
72 |
--------------------------------------------------------------------------------
/example/data/GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18.pgen:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/tensorqtl/812040e15f46797d5246a56339b2a699f1c596a6/example/data/GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18.pgen
--------------------------------------------------------------------------------
/example/data/GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18.psam:
--------------------------------------------------------------------------------
1 | #IID SEX
2 | HG00096 1
3 | HG00097 2
4 | HG00099 2
5 | HG00100 2
6 | HG00101 1
7 | HG00102 2
8 | HG00103 1
9 | HG00105 1
10 | HG00106 2
11 | HG00108 1
12 | HG00109 1
13 | HG00110 2
14 | HG00111 2
15 | HG00112 1
16 | HG00114 1
17 | HG00115 1
18 | HG00116 1
19 | HG00117 1
20 | HG00118 2
21 | HG00119 1
22 | HG00120 2
23 | HG00121 2
24 | HG00122 2
25 | HG00123 2
26 | HG00125 2
27 | HG00126 1
28 | HG00127 2
29 | HG00128 2
30 | HG00129 1
31 | HG00130 2
32 | HG00131 1
33 | HG00132 2
34 | HG00133 2
35 | HG00136 1
36 | HG00137 2
37 | HG00138 1
38 | HG00139 1
39 | HG00141 1
40 | HG00142 1
41 | HG00143 1
42 | HG00145 1
43 | HG00146 2
44 | HG00148 1
45 | HG00149 1
46 | HG00150 2
47 | HG00151 1
48 | HG00154 2
49 | HG00155 1
50 | HG00157 1
51 | HG00158 2
52 | HG00159 1
53 | HG00160 1
54 | HG00171 2
55 | HG00173 2
56 | HG00174 2
57 | HG00176 2
58 | HG00177 2
59 | HG00178 2
60 | HG00179 2
61 | HG00180 2
62 | HG00181 1
63 | HG00182 1
64 | HG00183 1
65 | HG00185 1
66 | HG00186 1
67 | HG00187 1
68 | HG00188 1
69 | HG00189 1
70 | HG00231 2
71 | HG00232 2
72 | HG00233 2
73 | HG00234 1
74 | HG00235 2
75 | HG00236 2
76 | HG00238 2
77 | HG00239 2
78 | HG00240 2
79 | HG00242 1
80 | HG00243 1
81 | HG00244 1
82 | HG00245 2
83 | HG00246 1
84 | HG00250 2
85 | HG00251 1
86 | HG00252 1
87 | HG00253 2
88 | HG00255 2
89 | HG00256 1
90 | HG00257 2
91 | HG00258 2
92 | HG00259 2
93 | HG00260 1
94 | HG00261 2
95 | HG00262 2
96 | HG00263 2
97 | HG00264 1
98 | HG00265 1
99 | HG00266 2
100 | HG00267 1
101 | HG00268 2
102 | HG00269 2
103 | HG00271 1
104 | HG00272 2
105 | HG00273 1
106 | HG00274 2
107 | HG00275 2
108 | HG00276 2
109 | HG00277 1
110 | HG00278 1
111 | HG00280 1
112 | HG00281 2
113 | HG00282 2
114 | HG00284 1
115 | HG00285 2
116 | HG00306 2
117 | HG00308 1
118 | HG00309 2
119 | HG00310 1
120 | HG00311 1
121 | HG00313 2
122 | HG00315 2
123 | HG00319 2
124 | HG00320 2
125 | HG00321 1
126 | HG00323 2
127 | HG00324 2
128 | HG00325 1
129 | HG00326 2
130 | HG00327 2
131 | HG00328 2
132 | HG00329 1
133 | HG00330 2
134 | HG00331 2
135 | HG00332 2
136 | HG00334 2
137 | HG00335 1
138 | HG00336 1
139 | HG00337 2
140 | HG00338 1
141 | HG00339 2
142 | HG00341 1
143 | HG00342 1
144 | HG00343 2
145 | HG00344 2
146 | HG00345 1
147 | HG00346 2
148 | HG00349 2
149 | HG00350 2
150 | HG00351 1
151 | HG00353 2
152 | HG00355 2
153 | HG00356 2
154 | HG00358 1
155 | HG00360 1
156 | HG00361 2
157 | HG00362 2
158 | HG00364 2
159 | HG00365 2
160 | HG00366 1
161 | HG00367 2
162 | HG00369 1
163 | HG00371 1
164 | HG00372 1
165 | HG00373 2
166 | HG00375 1
167 | HG00376 2
168 | HG00378 2
169 | HG00379 2
170 | HG00380 2
171 | HG00381 2
172 | HG00382 1
173 | HG00383 2
174 | HG00384 2
175 | HG01334 1
176 | HG01789 1
177 | HG01790 2
178 | HG01791 1
179 | HG02215 2
180 | NA06984 1
181 | NA06985 2
182 | NA06986 1
183 | NA06989 2
184 | NA06994 1
185 | NA07037 2
186 | NA07048 1
187 | NA07051 1
188 | NA07056 2
189 | NA07347 1
190 | NA07357 1
191 | NA10847 2
192 | NA10851 1
193 | NA11829 1
194 | NA11830 2
195 | NA11831 1
196 | NA11832 2
197 | NA11840 2
198 | NA11843 1
199 | NA11881 1
200 | NA11892 2
201 | NA11893 1
202 | NA11894 2
203 | NA11918 2
204 | NA11920 2
205 | NA11930 1
206 | NA11931 2
207 | NA11992 1
208 | NA11994 1
209 | NA11995 2
210 | NA12004 2
211 | NA12005 1
212 | NA12006 2
213 | NA12043 1
214 | NA12044 2
215 | NA12045 1
216 | NA12058 2
217 | NA12144 1
218 | NA12154 1
219 | NA12155 1
220 | NA12156 2
221 | NA12234 2
222 | NA12249 2
223 | NA12272 1
224 | NA12273 2
225 | NA12275 2
226 | NA12282 1
227 | NA12283 2
228 | NA12286 1
229 | NA12287 2
230 | NA12340 1
231 | NA12341 2
232 | NA12342 1
233 | NA12347 1
234 | NA12348 2
235 | NA12383 2
236 | NA12399 1
237 | NA12400 2
238 | NA12413 1
239 | NA12489 2
240 | NA12546 1
241 | NA12716 1
242 | NA12717 2
243 | NA12718 2
244 | NA12749 2
245 | NA12750 1
246 | NA12751 2
247 | NA12760 1
248 | NA12761 2
249 | NA12762 1
250 | NA12763 2
251 | NA12775 1
252 | NA12776 2
253 | NA12777 1
254 | NA12778 2
255 | NA12812 1
256 | NA12813 2
257 | NA12814 1
258 | NA12815 2
259 | NA12827 1
260 | NA12829 1
261 | NA12830 2
262 | NA12842 1
263 | NA12843 2
264 | NA12872 1
265 | NA12873 2
266 | NA12874 1
267 | NA12889 1
268 | NA12890 2
269 | NA18486 1
270 | NA18488 2
271 | NA18489 2
272 | NA18498 1
273 | NA18499 2
274 | NA18502 2
275 | NA18505 2
276 | NA18508 2
277 | NA18510 1
278 | NA18511 2
279 | NA18517 2
280 | NA18519 1
281 | NA18520 2
282 | NA18858 2
283 | NA18861 2
284 | NA18867 2
285 | NA18868 1
286 | NA18870 2
287 | NA18873 2
288 | NA18907 2
289 | NA18908 1
290 | NA18909 2
291 | NA18910 1
292 | NA18912 2
293 | NA18916 2
294 | NA18917 1
295 | NA18923 1
296 | NA18933 2
297 | NA18934 1
298 | NA19092 1
299 | NA19093 2
300 | NA19095 2
301 | NA19096 1
302 | NA19098 1
303 | NA19099 2
304 | NA19102 2
305 | NA19107 1
306 | NA19108 2
307 | NA19113 1
308 | NA19114 2
309 | NA19116 2
310 | NA19117 1
311 | NA19118 2
312 | NA19119 1
313 | NA19121 1
314 | NA19129 2
315 | NA19130 1
316 | NA19131 2
317 | NA19137 2
318 | NA19138 1
319 | NA19141 1
320 | NA19143 2
321 | NA19144 1
322 | NA19146 1
323 | NA19147 2
324 | NA19149 2
325 | NA19152 2
326 | NA19153 1
327 | NA19159 2
328 | NA19160 1
329 | NA19171 1
330 | NA19172 2
331 | NA19175 1
332 | NA19184 1
333 | NA19185 2
334 | NA19189 1
335 | NA19190 2
336 | NA19197 2
337 | NA19198 1
338 | NA19200 1
339 | NA19201 2
340 | NA19204 2
341 | NA19206 2
342 | NA19207 1
343 | NA19209 2
344 | NA19210 1
345 | NA19213 1
346 | NA19214 2
347 | NA19222 2
348 | NA19223 1
349 | NA19225 2
350 | NA19235 2
351 | NA19236 1
352 | NA19247 2
353 | NA19248 1
354 | NA19256 1
355 | NA19257 2
356 | NA20502 2
357 | NA20503 2
358 | NA20504 2
359 | NA20505 2
360 | NA20506 2
361 | NA20507 2
362 | NA20508 2
363 | NA20509 1
364 | NA20510 1
365 | NA20512 1
366 | NA20513 1
367 | NA20514 2
368 | NA20515 1
369 | NA20516 1
370 | NA20517 2
371 | NA20518 1
372 | NA20519 1
373 | NA20520 1
374 | NA20521 1
375 | NA20524 1
376 | NA20525 1
377 | NA20527 1
378 | NA20528 1
379 | NA20529 2
380 | NA20530 2
381 | NA20531 2
382 | NA20532 1
383 | NA20534 1
384 | NA20535 2
385 | NA20536 1
386 | NA20538 1
387 | NA20539 1
388 | NA20540 2
389 | NA20541 2
390 | NA20542 2
391 | NA20543 1
392 | NA20544 1
393 | NA20581 1
394 | NA20582 2
395 | NA20585 2
396 | NA20586 1
397 | NA20588 1
398 | NA20589 2
399 | NA20752 1
400 | NA20754 1
401 | NA20756 2
402 | NA20757 2
403 | NA20758 1
404 | NA20759 1
405 | NA20760 2
406 | NA20761 2
407 | NA20765 1
408 | NA20766 2
409 | NA20768 2
410 | NA20769 2
411 | NA20770 1
412 | NA20771 2
413 | NA20772 2
414 | NA20773 2
415 | NA20774 2
416 | NA20778 1
417 | NA20783 1
418 | NA20785 1
419 | NA20786 2
420 | NA20787 1
421 | NA20790 2
422 | NA20792 1
423 | NA20795 2
424 | NA20796 1
425 | NA20797 2
426 | NA20798 1
427 | NA20799 2
428 | NA20800 2
429 | NA20801 1
430 | NA20802 2
431 | NA20803 1
432 | NA20804 2
433 | NA20805 1
434 | NA20806 1
435 | NA20807 2
436 | NA20808 2
437 | NA20809 1
438 | NA20810 1
439 | NA20811 1
440 | NA20812 1
441 | NA20813 2
442 | NA20814 1
443 | NA20815 1
444 | NA20819 2
445 | NA20826 2
446 | NA20828 2
447 |
--------------------------------------------------------------------------------
/example/data/GEUVADIS.445_samples.expression.bed.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/tensorqtl/812040e15f46797d5246a56339b2a699f1c596a6/example/data/GEUVADIS.445_samples.expression.bed.gz
--------------------------------------------------------------------------------
/example/tensorqtl_examples.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### *cis*- and *trans*-QTL mapping with tensorQTL\n",
8 | "\n",
9 | "This notebook provides examples for running *cis*- and *trans*-QTL mapping with tensorQTL, using open-access data from the [GEUVADIS](https://www.ebi.ac.uk/arrayexpress/experiments/E-GEUV-1/) project.\n",
10 | "\n",
11 | "#### Requirements\n",
12 | "An environment configured with a GPU and ~50GB of memory.\n",
13 | "\n",
14 | "#### Test dataset\n",
15 | "\n",
16 | "*Note: these files are provided for testing/benchmarking purposes only. They do not constitute an official release from the GEUVADIS project, and no quality-control was applied.*\n",
17 | "\n",
18 | "Genotypes in PLINK2 format (chr18 only), and normalized expression data are available [in this repository](./data/); the full dataset is available at [gs://gtex-resources/test_data/geuvadis](https://console.cloud.google.com/storage/browser/gtex-resources/test_data/geuvadis) ([requester pays](https://cloud.google.com/storage/docs/requester-pays))."
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "metadata": {},
25 | "outputs": [
26 | {
27 | "name": "stdout",
28 | "output_type": "stream",
29 | "text": [
30 | "torch: 2.5.1+cu124 (CUDA 12.4), device: cuda\n",
31 | "pandas: 2.2.3\n"
32 | ]
33 | }
34 | ],
35 | "source": [
36 | "import pandas as pd\n",
37 | "import torch\n",
38 | "import tensorqtl\n",
39 | "from tensorqtl import pgen, cis, trans, post\n",
40 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
41 | "print(f\"torch: {torch.__version__} (CUDA {torch.version.cuda}), device: {device}\")\n",
42 | "print(f\"pandas: {pd.__version__}\")\n",
43 | "\n",
44 | "# define paths to data\n",
45 | "plink_prefix_path = 'data/GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18'\n",
46 | "expression_bed = 'data/GEUVADIS.445_samples.expression.bed.gz'\n",
47 | "covariates_file = 'data/GEUVADIS.445_samples.covariates.txt'\n",
48 | "prefix = 'GEUVADIS.445_samples'\n",
49 | "\n",
50 | "# load phenotypes and covariates\n",
51 | "phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expression_bed)\n",
52 | "covariates_df = pd.read_csv(covariates_file, sep='\\t', index_col=0).T\n",
53 | "\n",
54 | "# PLINK reader for genotypes\n",
55 | "pgr = pgen.PgenReader(plink_prefix_path)\n",
56 | "genotype_df = pgr.load_genotypes()\n",
57 | "variant_df = pgr.variant_df"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "### *cis*-QTL: nominal p-values for all variant-phenotype pairs"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 2,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "name": "stdout",
74 | "output_type": "stream",
75 | "text": [
76 | "cis-QTL mapping: nominal associations for all variant-phenotype pairs\n",
77 | " * 445 samples\n",
78 | " * 301 phenotypes\n",
79 | " * 26 covariates\n",
80 | " * 367759 variants\n",
81 | " * cis-window: ±1,000,000\n",
82 | " * checking phenotypes: 301/301\n",
83 | " * Computing associations\n",
84 | " Mapping chromosome chr18\n",
85 | " processing phenotype 301/301\n",
86 | " time elapsed: 0.04 min\n",
87 | " * writing output\n",
88 | "done.\n"
89 | ]
90 | }
91 | ],
92 | "source": [
93 | "# map all cis-associations (results for each chromosome are written to file)\n",
94 | "\n",
95 | "# all genes\n",
96 | "# cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, prefix, covariates_df=covariates_df)\n",
97 | "\n",
98 | "# genes on chr18\n",
99 | "cis.map_nominal(genotype_df, variant_df,\n",
100 | " phenotype_df.loc[phenotype_pos_df['chr'] == 'chr18'],\n",
101 | " phenotype_pos_df.loc[phenotype_pos_df['chr'] == 'chr18'],\n",
102 | " prefix, covariates_df=covariates_df)"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 3,
108 | "metadata": {},
109 | "outputs": [
110 | {
111 | "data": {
208 | " phenotype_id variant_id start_distance af \\\n",
209 | "0 ENSG00000263006.6 chr18_10644_C_G_b38 -98421 0.016854 \n",
210 | "1 ENSG00000263006.6 chr18_10847_C_A_b38 -98218 0.019101 \n",
211 | "2 ENSG00000263006.6 chr18_11275_G_A_b38 -97790 0.024719 \n",
212 | "3 ENSG00000263006.6 chr18_11358_G_A_b38 -97707 0.024719 \n",
213 | "4 ENSG00000263006.6 chr18_11445_G_A_b38 -97620 0.023596 \n",
214 | "\n",
215 | " ma_samples ma_count pval_nominal slope slope_se \n",
216 | "0 15 15 0.580873 -0.117761 0.213125 \n",
217 | "1 17 17 0.142884 -0.298726 0.203505 \n",
218 | "2 22 22 0.745231 0.054619 0.167981 \n",
219 | "3 22 22 0.745231 0.054619 0.167981 \n",
220 | "4 21 21 0.603276 0.089378 0.171851 "
221 | ]
222 | },
223 | "execution_count": 3,
224 | "metadata": {},
225 | "output_type": "execute_result"
226 | }
227 | ],
228 | "source": [
229 | "# load results\n",
230 | "pairs_df = pd.read_parquet(f'{prefix}.cis_qtl_pairs.chr18.parquet')\n",
231 | "pairs_df.head()"
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "metadata": {},
237 | "source": [
238 | "### *cis*-QTL: empirical p-values for phenotypes"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 4,
244 | "metadata": {},
245 | "outputs": [
246 | {
247 | "name": "stdout",
248 | "output_type": "stream",
249 | "text": [
250 | "cis-QTL mapping: empirical p-values for phenotypes\n",
251 | " * 445 samples\n",
252 | " * 301 phenotypes\n",
253 | " * 26 covariates\n",
254 | " * 367759 variants\n",
255 | " * cis-window: ±1,000,000\n",
256 | " * using seed 123456\n",
257 | " * checking phenotypes: 301/301\n",
258 | " * computing permutations\n",
259 | " processing phenotype 301/301\n",
260 | " Time elapsed: 0.31 min\n",
261 | "done.\n",
262 | "Computing q-values\n",
263 | " * Number of phenotypes tested: 301\n",
264 | " * Correlation between Beta-approximated and empirical p-values: 1.0000\n",
265 | " * Calculating q-values with lambda = 0.850\n",
266 | " * Proportion of significant phenotypes (1-pi0): 0.76\n",
267 | " * QTL phenotypes @ FDR 0.05: 205\n",
268 | " * min p-value threshold @ FDR 0.05: 0.135284\n"
269 | ]
270 | }
271 | ],
272 | "source": [
273 | "# all genes\n",
274 | "# cis_df = cis.map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df=covariates_df)\n",
275 | "\n",
276 | "# genes on chr18\n",
277 | "cis_df = cis.map_cis(genotype_df, variant_df, \n",
278 | " phenotype_df.loc[phenotype_pos_df['chr'] == 'chr18'],\n",
279 | " phenotype_pos_df.loc[phenotype_pos_df['chr'] == 'chr18'],\n",
280 | " covariates_df=covariates_df, seed=123456)\n",
281 | "# compute q-values (in practice, this must be run on all genes, not a subset)\n",
282 | "post.calculate_qvalues(cis_df, fdr=0.05, qvalue_lambda=0.85)"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": 5,
288 | "metadata": {},
289 | "outputs": [
290 | {
291 | "data": {
462 | "text/plain": [
463 | " num_var beta_shape1 beta_shape2 true_df \\\n",
464 | "phenotype_id \n",
465 | "ENSG00000263006.6 6120 1.038811 1138.434082 374.660400 \n",
466 | "ENSG00000101557.14 6355 1.032237 1076.303223 370.176422 \n",
467 | "ENSG00000079134.11 6921 1.047219 1155.660156 370.356049 \n",
468 | "ENSG00000263884.1 6921 1.039806 1152.501587 369.873505 \n",
469 | "ENSG00000158270.11 8134 1.054919 1277.927246 369.469086 \n",
470 | "\n",
471 | " pval_true_df variant_id start_distance \\\n",
472 | "phenotype_id \n",
473 | "ENSG00000263006.6 8.220950e-40 chr18_112535_G_A_b38 3470 \n",
474 | "ENSG00000101557.14 5.632806e-11 chr18_210698_T_C_b38 52315 \n",
475 | "ENSG00000079134.11 3.888738e-08 chr18_243547_T_A_b38 -24503 \n",
476 | "ENSG00000263884.1 7.681884e-04 chr18_584440_G_C_b38 316292 \n",
477 | "ENSG00000158270.11 2.516529e-09 chr18_519222_C_T_b38 18500 \n",
478 | "\n",
479 | " end_distance ma_samples ma_count af \\\n",
480 | "phenotype_id \n",
481 | "ENSG00000263006.6 3470 212 251 0.282022 \n",
482 | "ENSG00000101557.14 52315 192 222 0.249438 \n",
483 | "ENSG00000079134.11 -24503 293 383 0.430337 \n",
484 | "ENSG00000263884.1 316292 81 88 0.098876 \n",
485 | "ENSG00000158270.11 18500 108 115 0.129213 \n",
486 | "\n",
487 | " pval_nominal slope slope_se pval_perm pval_beta \\\n",
488 | "phenotype_id \n",
489 | "ENSG00000263006.6 4.050344e-44 0.726425 0.046171 0.000100 3.677735e-38 \n",
490 | "ENSG00000101557.14 3.505411e-12 -0.191712 0.026749 0.000100 3.498951e-08 \n",
491 | "ENSG00000079134.11 5.473709e-09 -0.122720 0.020602 0.000100 2.743975e-05 \n",
492 | "ENSG00000263884.1 3.540399e-04 -0.330811 0.091845 0.574843 5.695498e-01 \n",
493 | "ENSG00000158270.11 2.409717e-10 -0.388277 0.059808 0.000100 1.567348e-06 \n",
494 | "\n",
495 | " qval pval_nominal_threshold \n",
496 | "phenotype_id \n",
497 | "ENSG00000263006.6 2.697006e-37 0.000141 \n",
498 | "ENSG00000101557.14 3.563747e-08 0.000146 \n",
499 | "ENSG00000079134.11 1.916427e-05 0.000141 \n",
500 | "ENSG00000263884.1 1.577698e-01 0.000139 \n",
501 | "ENSG00000158270.11 1.321136e-06 0.000130 "
502 | ]
503 | },
504 | "execution_count": 5,
505 | "metadata": {},
506 | "output_type": "execute_result"
507 | }
508 | ],
509 | "source": [
510 | "cis_df.head()"
511 | ]
512 | },
513 | {
514 | "cell_type": "markdown",
515 | "metadata": {},
516 | "source": [
517 | "### *trans*-QTL mapping"
518 | ]
519 | },
520 | {
521 | "cell_type": "code",
522 | "execution_count": 6,
523 | "metadata": {},
524 | "outputs": [
525 | {
526 | "name": "stdout",
527 | "output_type": "stream",
528 | "text": [
529 | "trans-QTL mapping\n",
530 | " * 445 samples\n",
531 | " * 19836 phenotypes\n",
532 | " * 26 covariates\n",
533 | " * 367759 variants\n",
534 | " processing batch 37/37\n",
535 | " elapsed time: 0.02 min\n",
536 | " * 210838 variants passed MAF >= 0.05 filtering\n",
537 | "done.\n"
538 | ]
539 | }
540 | ],
541 | "source": [
542 | "# run mapping\n",
543 | "# to limit output size, only associations with p-value <= 1e-5 are returned\n",
544 | "trans_df = trans.map_trans(genotype_df, phenotype_df, covariates_df, batch_size=10000,\n",
545 | " return_sparse=True, pval_threshold=1e-5, maf_threshold=0.05)"
546 | ]
547 | },
548 | {
549 | "cell_type": "code",
550 | "execution_count": 7,
551 | "metadata": {},
552 | "outputs": [],
553 | "source": [
554 | "# remove cis-associations\n",
555 | "trans_df = trans.filter_cis(trans_df, phenotype_pos_df, variant_df, window=5000000)"
556 | ]
557 | },
558 | {
559 | "cell_type": "code",
560 | "execution_count": 8,
561 | "metadata": {},
562 | "outputs": [
563 | {
564 | "data": {
642 | "text/plain": [
643 | " variant_id phenotype_id pval b b_se \\\n",
644 | "1 chr18_20683_A_G_b38 ENSG00000163900.10 5.012229e-06 0.209540 0.045309 \n",
645 | "3 chr18_27346_G_T_b38 ENSG00000164088.17 7.309937e-06 -0.265623 0.058483 \n",
646 | "11 chr18_43564_G_A_b38 ENSG00000198162.12 1.314060e-07 -0.202922 0.037792 \n",
647 | "12 chr18_43564_G_A_b38 ENSG00000261098.1 8.494569e-06 -0.421968 0.093594 \n",
648 | "13 chr18_43611_C_T_b38 ENSG00000265972.5 1.448981e-06 -0.272301 0.055697 \n",
649 | "\n",
650 | " af \n",
651 | "1 0.179775 \n",
652 | "3 0.123596 \n",
653 | "11 0.093258 \n",
654 | "12 0.093258 \n",
655 | "13 0.135955 "
656 | ]
657 | },
658 | "execution_count": 8,
659 | "metadata": {},
660 | "output_type": "execute_result"
661 | }
662 | ],
663 | "source": [
664 | "trans_df.head()"
665 | ]
666 | },
667 | {
668 | "cell_type": "code",
669 | "execution_count": null,
670 | "metadata": {},
671 | "outputs": [],
672 | "source": []
673 | }
674 | ],
675 | "metadata": {
676 | "kernelspec": {
677 | "display_name": "Python 3 (ipykernel)",
678 | "language": "python",
679 | "name": "python3"
680 | },
681 | "language_info": {
682 | "codemirror_mode": {
683 | "name": "ipython",
684 | "version": 3
685 | },
686 | "file_extension": ".py",
687 | "mimetype": "text/x-python",
688 | "name": "python",
689 | "nbconvert_exporter": "python",
690 | "pygments_lexer": "ipython3",
691 | "version": "3.11.9"
692 | }
693 | },
694 | "nbformat": 4,
695 | "nbformat_minor": 4
696 | }
697 |
--------------------------------------------------------------------------------
/install/INSTALL.md:
--------------------------------------------------------------------------------
1 | ### Setup CUDA drivers and PyTorch on GCP
2 |
3 | Launch a new instance configured with Ubuntu 22.04 LTS and a GPU, clone this repository, and run the following:
4 | #### Install CUDA
5 | ```bash
6 | sudo ./install_cuda.sh
7 | sudo reboot
8 | # verify
9 | nvidia-smi
10 | ```
11 |
12 | #### Install R
13 | Required for computing q-values. Follow the instructions [here](https://www.digitalocean.com/community/tutorials/how-to-install-r-on-ubuntu-22-04), then install the 'qvalue' package from an R session:
14 | ```R
15 | if (!require("BiocManager", quietly = TRUE))
16 | install.packages("BiocManager")
17 | BiocManager::install("qvalue")
18 | ```
19 |
20 | #### Install Python 3
21 | Using a [conda](https://github.com/conda-forge/miniforge) environment is recommended. The `tensorqtl_env.yml` configuration contains all required packages, including `torch` and `tensorqtl`.
22 | ```bash
23 | mamba env create -f tensorqtl_env.yml
24 | conda activate tensorqtl
25 |
26 | # verify
27 | python -c "import torch; print(torch.__version__); print('CUDA available: {} ({})'.format(torch.cuda.is_available(), torch.cuda.get_device_name(torch.cuda.current_device())))"
28 |
29 | # this should print something like
30 | # 2.1.2+cu121
31 | # CUDA available: True (Tesla P100-PCIE-16GB)
32 | ```
33 |
34 | #### Install rmate (optional)
35 | ```bash
36 | sudo apt install -y ruby
37 | mkdir ~/bin
38 | curl -Lo ~/bin/rmate https://raw.githubusercontent.com/textmate/rmate/master/bin/rmate
39 | chmod a+x ~/bin/rmate
40 | echo 'export RMATE_PORT=${rmate_port}' >> ~/.bashrc
41 | ```
42 |
--------------------------------------------------------------------------------
/install/install_cuda.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # install script for PyTorch 2.1.2 + CUDA 12.1 on Ubuntu 22.04
3 | # for torch, see https://pytorch.org/get-started/locally/
4 | # for CUDA drivers, see https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_local
5 | # for other versions, see https://developer.nvidia.com/cuda-toolkit-archive
6 |
7 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin
8 | sudo mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
9 | wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda-repo-ubuntu2204-12-1-local_12.1.0-530.30.02-1_amd64.deb
10 | sudo dpkg -i cuda-repo-ubuntu2204-12-1-local_12.1.0-530.30.02-1_amd64.deb
11 | sudo cp /var/cuda-repo-ubuntu2204-12-1-local/cuda-*-keyring.gpg /usr/share/keyrings/
12 | sudo apt-get update
13 | sudo apt-get -y install cuda
14 | rm cuda-repo-ubuntu2204-12-1-local_12.1.0-530.30.02-1_amd64.deb
15 |
16 | # test
17 | python -c "import torch; print('CUDA available: {} ({})'.format(torch.cuda.is_available(), torch.cuda.get_device_name(torch.cuda.current_device())))"
18 |
--------------------------------------------------------------------------------
/install/tensorqtl_env.yml:
--------------------------------------------------------------------------------
1 | name: tensorqtl
2 | dependencies:
3 | - python=3.11
4 | - pip
5 | - pip:
6 | - numpy
7 | - pandas
8 | - pandas-plink
9 | - Pgenlib>=0.90.1
10 | - pyarrow
11 | - qtl
12 | - rpy2
13 | - scipy
14 | - torch
15 | - tensorqtl
16 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [tool.setuptools]
6 | packages = ["tensorqtl"]
7 |
8 | [project]
9 | name = "tensorqtl"
10 | version = "1.0.10"
11 | dependencies = [
12 | "numpy",
13 | "pandas",
14 | "Pgenlib>=0.90.1",
15 | "qtl",
16 | "scipy",
17 | "torch",
18 | ]
19 | authors = [
20 | {name = "Francois Aguet", email = "francois@broadinstitute.org"}
21 | ]
22 | maintainers = [
23 | {name = "Francois Aguet", email = "francois@broadinstitute.org"}
24 | ]
25 | description = "GPU-accelerated QTL mapper"
26 | readme = "README.md"
27 | license = {file = "LICENSE"}
28 | keywords = ["Quantitative trait loci"]
29 | classifiers = [
30 | "Development Status :: 4 - Beta",
31 | "Programming Language :: Python :: 3",
32 | "Intended Audience :: Science/Research",
33 | "Topic :: Scientific/Engineering :: Bio-Informatics",
34 | ]
35 |
36 | [project.urls]
37 | Repository = "https://github.com/broadinstitute/tensorqtl.git"
38 |
--------------------------------------------------------------------------------
/tensorqtl/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib.metadata
2 | from .tensorqtl import *
3 |
4 | __version__ = importlib.metadata.version(__name__)
5 |
--------------------------------------------------------------------------------
/tensorqtl/__main__.py:
--------------------------------------------------------------------------------
1 | import tensorqtl
2 | tensorqtl.main()
3 |
--------------------------------------------------------------------------------
/tensorqtl/coloc.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import scipy.stats as stats
4 | import torch
5 | import os
6 | import time
7 | import sys
8 | sys.path.insert(1, os.path.dirname(__file__))
9 | import genotypeio, eigenmt
10 | from core import *
11 |
12 |
13 | def logsumexp(x, dim=0):  # numerically stable log(sum(exp(x))) along dim
14 | mmax,_ = torch.max(x, dim=dim, keepdim=True)
15 | return mmax + (x-mmax).exp().sum(dim, keepdim=True).log()
16 |
17 |
18 | def logdiff(x, y, dim=0):  # numerically stable log(exp(x) - exp(y))
19 | xmax,_ = torch.max(x, dim=dim, keepdim=True)
20 | ymax,_ = torch.max(y, dim=dim, keepdim=True)
21 | mmax = torch.max(xmax, ymax)
22 | return mmax + ((x - mmax).exp() - (y - mmax).exp()).log()
23 |
24 |
25 | def coloc(genotypes1_t, genotypes2_t, phenotype1_t, phenotype2_t,
26 | residualizer1=None, residualizer2=None, mode='beta',
27 | p1=1e-4, p2=1e-4, p12=1e-5):
28 | """COLOC from summary statistics (either beta/sds or p-values and MAF)"""
29 |
30 | assert phenotype1_t.dim() == 1
31 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
32 |
33 | # phenotype 1
34 | if mode == 'beta':
35 | r_nominal_t, genotype_var_t, phenotype_var_t = calculate_corr(
36 | genotypes1_t, phenotype1_t.reshape(1,-1), residualizer1, return_var=True)
37 | r_nominal_t = r_nominal_t.squeeze()
38 | var_ratio_t = phenotype_var_t.reshape(1,-1) / genotype_var_t.reshape(-1,1)
39 | else:
40 | r_nominal_t = calculate_corr(
41 | genotypes1_t, phenotype1_t.reshape(1,-1), residualizer1, return_var=False).squeeze()
42 | r2_nominal_t = r_nominal_t.double().pow(2)
43 |
44 | if residualizer1 is not None:
45 | dof = residualizer1.dof
46 | else:
47 | dof = phenotype1_t.shape[0] - 2
48 |
49 | if mode == 'beta':
50 | tstat2_t = r2_nominal_t * dof / (1 - r2_nominal_t)
51 | beta2_t = r2_nominal_t * var_ratio_t.squeeze()
52 | beta_var_t = beta2_t / tstat2_t
53 | var_prior = 0.0225 * phenotype_var_t
54 | r = var_prior / (var_prior + beta_var_t)
55 | l1 = 0.5 * ((1 - r).log() + r*tstat2_t)
56 | else:
57 | # compute p-values and z-score to match COLOC results exactly
58 | # (instead of directly using t-statistic)
59 | tstat_t = r_nominal_t * torch.sqrt(dof / (1 - r2_nominal_t))
60 | p = stats.t.cdf(-np.abs(tstat_t.cpu().numpy()), dof) # 2 dropped since canceled in isf
61 | maf_t = calculate_maf(genotypes1_t)
62 | N = phenotype1_t.shape[0]
63 | v = 1 / (2 * N * maf_t * (1 - maf_t))
64 | z2_t = torch.Tensor(stats.norm.isf(p)**2).to(device)
65 | r = 0.0225 / (0.0225 + v)
66 | l1 = 0.5 * ((1 - r).log() + r*z2_t)
67 |
68 | # phenotype 2
69 | if phenotype2_t.dim() == 1:
70 | num_phenotypes = 1
71 | num_samples = phenotype2_t.shape[0]
72 | phenotype2_t = phenotype2_t.reshape(1,-1)
73 | else:
74 | num_phenotypes, num_samples = phenotype2_t.shape
75 |
76 | if mode == 'beta':
77 | r_nominal_t, genotype_var_t, phenotype_var_t = calculate_corr(
78 | genotypes2_t, phenotype2_t, residualizer2, return_var=True)
79 | r_nominal_t = r_nominal_t.squeeze()
80 | var_ratio_t = phenotype_var_t.reshape(1,-1) / genotype_var_t.reshape(-1,1)
81 | else:
82 | r_nominal_t = calculate_corr(genotypes2_t, phenotype2_t, residualizer2, return_var=False).squeeze()
83 | r2_nominal_t = r_nominal_t.double().pow(2)
84 |
85 | if residualizer2 is not None:
86 | dof = residualizer2.dof
87 | else:
88 | dof = num_samples - 2
89 |
90 | if mode == 'beta':
91 | tstat2_t = r2_nominal_t * dof / (1 - r2_nominal_t)
92 | beta2_t = r2_nominal_t * var_ratio_t.squeeze()
93 | beta_var_t = beta2_t / tstat2_t
94 | var_prior = 0.0225 * phenotype_var_t
95 | r = var_prior / (var_prior + beta_var_t)
96 | l2 = 0.5 * ((1 - r).log() + r*tstat2_t)
97 | else:
98 | tstat_t = r_nominal_t * torch.sqrt(dof / (1 - r2_nominal_t))
99 | p = stats.t.cdf(-np.abs(tstat_t.cpu().numpy()), dof)
100 | maf_t = calculate_maf(genotypes2_t)
101 | v = 1 / (2 * num_samples * maf_t * (1 - maf_t))
102 | z2_t = torch.Tensor(stats.norm.isf(p)**2).to(device)
103 | r = 0.0225 / (0.0225 + v)
104 | if num_phenotypes > 1:
105 | r = r.reshape(-1,1)
106 | l2 = 0.5 * ((1 - r).log() + r*z2_t)
107 |
108 | if num_phenotypes > 1:
109 | lsum = l1.reshape(-1,1) + l2
110 | lh0_abf = torch.zeros([1, num_phenotypes]).to(device)
111 | lh1_abf = np.log(p1) + logsumexp(l1).repeat([1, num_phenotypes])
112 | else:
113 | lsum = l1 + l2
114 | lh0_abf = torch.zeros([1]).to(device)
115 | lh1_abf = np.log(p1) + logsumexp(l1)
116 | lh2_abf = np.log(p2) + logsumexp(l2)
117 | lh3_abf = np.log(p1) + np.log(p2) + logdiff(logsumexp(l1) + logsumexp(l2), logsumexp(lsum))
118 | lh4_abf = np.log(p12) + logsumexp(lsum)
119 | all_abf = torch.cat([lh0_abf, lh1_abf, lh2_abf, lh3_abf, lh4_abf])
120 | return (all_abf - logsumexp(all_abf, dim=0)).exp().squeeze()
121 |
122 |
123 | def run_pairs(genotype_df, variant_df, phenotype1_df, phenotype2_df, phenotype_pos_df,
124 | covariates1_df=None, covariates2_df=None, p1=1e-4, p2=1e-4, p12=1e-5, mode='beta',
125 | maf_threshold=0, window=1000000, batch_size=10000, logger=None, verbose=True):
126 | """Compute COLOC for all phenotype pairs"""
127 |
128 | assert np.all(phenotype1_df.index == phenotype2_df.index)
129 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
130 |
131 | if logger is None:
132 | logger = SimpleLogger()
133 |
134 | logger.write('Computing COLOC for all pairs of phenotypes')
135 | logger.write(f' * {phenotype1_df.shape[0]} phenotypes')
136 | logger.write(f' * phenotype group 1: {phenotype1_df.shape[1]} samples')
137 | logger.write(f' * phenotype group 2: {phenotype2_df.shape[1]} samples')
138 |
139 | if covariates1_df is not None:
140 | assert np.all(phenotype1_df.columns == covariates1_df.index)
141 | logger.write(f' * phenotype group 1: {covariates1_df.shape[1]} covariates')
142 | residualizer1 = Residualizer(torch.tensor(covariates1_df.values, dtype=torch.float32).to(device))
143 | else:
144 | residualizer1 = None
145 |
146 | if covariates2_df is not None:
147 | assert np.all(phenotype2_df.columns == covariates2_df.index)
148 | logger.write(f' * phenotype group 2: {covariates2_df.shape[1]} covariates')
149 | residualizer2 = Residualizer(torch.tensor(covariates2_df.values, dtype=torch.float32).to(device))
150 | else:
151 | residualizer2 = None
152 |
153 | if maf_threshold > 0:
154 |         logger.write(f' * applying in-sample MAF >= {maf_threshold} filter (variants must pass in at least one cohort)')
155 |
156 | genotype1_ix = np.array([genotype_df.columns.tolist().index(i) for i in phenotype1_df.columns])
157 | genotype1_ix_t = torch.from_numpy(genotype1_ix).to(device)
158 | genotype2_ix = np.array([genotype_df.columns.tolist().index(i) for i in phenotype2_df.columns])
159 | genotype2_ix_t = torch.from_numpy(genotype2_ix).to(device)
160 |
161 | igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype1_df, phenotype_pos_df, window=window)
162 | coloc_df = []
163 | start_time = time.time()
164 | logger.write(' * Computing pairwise colocalization')
165 | for phenotype1, genotypes, genotype_range, phenotype_id in igc.generate_data(verbose=verbose):
166 | phenotype2 = phenotype2_df.loc[phenotype_id]
167 |
168 | # copy to GPU
169 | phenotype1_t = torch.tensor(phenotype1, dtype=torch.float).to(device)
170 | phenotype2_t = torch.tensor(phenotype2, dtype=torch.float).to(device)
171 | genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
172 | genotypes1_t = genotypes_t[:,genotype1_ix_t]
173 | genotypes2_t = genotypes_t[:,genotype2_ix_t]
174 | del genotypes_t
175 |
176 | impute_mean(genotypes1_t)
177 | impute_mean(genotypes2_t)
178 | # filter monomorphic sites
179 | m = ((genotypes1_t==0).all(1) | (genotypes1_t==1).all(1) | (genotypes1_t==2).all(1) |
180 | (genotypes2_t==0).all(1) | (genotypes2_t==1).all(1) | (genotypes2_t==2).all(1))
181 | genotypes1_t = genotypes1_t[~m]
182 | genotypes2_t = genotypes2_t[~m]
183 |
184 | if maf_threshold > 0:
185 | maf1_t = calculate_maf(genotypes1_t)
186 | maf2_t = calculate_maf(genotypes2_t)
187 | mask_t = (maf1_t >= maf_threshold) | (maf2_t >= maf_threshold)
188 | genotypes1_t = genotypes1_t[mask_t]
189 | genotypes2_t = genotypes2_t[mask_t]
190 |
191 | coloc_t = coloc(genotypes1_t, genotypes2_t, phenotype1_t, phenotype2_t,
192 | residualizer1=residualizer1, residualizer2=residualizer2,
193 | p1=p1, p2=p2, p12=p12, mode=mode)
194 | coloc_df.append(coloc_t.cpu().numpy())
195 | logger.write(' time elapsed: {:.2f} min'.format((time.time()-start_time)/60))
196 | coloc_df = pd.DataFrame(coloc_df, columns=[f'pp_h{i}_abf' for i in range(5)], index=phenotype1_df.index)
197 | logger.write('done.')
198 | return coloc_df
199 |
--------------------------------------------------------------------------------
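A minimal usage sketch for `run_pairs` above (illustrative only: file paths are placeholders, and the covariates files are assumed to be stored as covariates x samples, hence the transpose):

    import pandas as pd
    from tensorqtl import genotypeio, coloc
    from tensorqtl.core import read_phenotype_bed

    # genotypes shared by both cohorts (PLINK prefix or other supported format)
    genotype_df, variant_df = genotypeio.load_genotypes('genotypes_prefix')

    # two phenotype matrices with identical phenotype IDs (rows)
    phenotype1_df, phenotype_pos_df = read_phenotype_bed('cohort1.expression.bed.gz')
    phenotype2_df, _ = read_phenotype_bed('cohort2.expression.bed.gz')

    covariates1_df = pd.read_csv('cohort1.covariates.txt', sep='\t', index_col=0).T
    covariates2_df = pd.read_csv('cohort2.covariates.txt', sep='\t', index_col=0).T

    # returns posterior probabilities pp_h0_abf ... pp_h4_abf for each phenotype
    coloc_df = coloc.run_pairs(genotype_df, variant_df, phenotype1_df, phenotype2_df,
                               phenotype_pos_df, covariates1_df=covariates1_df,
                               covariates2_df=covariates2_df, mode='beta')
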
/tensorqtl/core.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import pandas as pd
4 | import scipy.stats as stats
5 | import scipy.optimize
6 | from scipy.special import loggamma
7 | import sys
8 | import re
9 | import subprocess
10 |
11 | # check R
12 | has_rpy2 = False
13 | try:
14 | subprocess.check_call('which R', shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
15 | subprocess.check_call("R -e 'library(qvalue)'", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
16 | import rpy2
17 | import rfunc
18 | has_rpy2 = True
19 | except:
20 | print("Warning: 'rfunc' cannot be imported. R with the 'qvalue' library and the 'rpy2' Python package are needed to compute q-values.")
21 |
22 |
23 | output_dtype_dict = {
24 | 'num_var':np.int32,
25 | 'beta_shape1':np.float32,
26 | 'beta_shape2':np.float32,
27 | 'true_df':np.float32,
28 | 'pval_true_df':np.float64,
29 | 'variant_id':str,
30 | 'start_distance':np.int32,
31 | 'end_distance':np.int32,
32 | 'ma_samples':np.int32,
33 | 'ma_count':np.int32,
34 | 'af':np.float32,
35 | 'pval_nominal':np.float64,
36 | 'slope':np.float32,
37 | 'slope_se':np.float32,
38 | 'pval_perm':np.float64,
39 | 'pval_beta':np.float64,
40 | }
41 |
42 |
43 | class SimpleLogger(object):
44 | def __init__(self, logfile=None, verbose=True):
45 | self.console = sys.stdout
46 | self.verbose = verbose
47 | if logfile is not None:
48 | self.log = open(logfile, 'w')
49 | else:
50 | self.log = None
51 |
52 | def write(self, message):
53 | if self.verbose:
54 | self.console.write(message+'\n')
55 | if self.log is not None:
56 | self.log.write(message+'\n')
57 | self.log.flush()
58 |
59 | #------------------------------------------------------------------------------
60 | # Core classes/functions for mapping associations on GPU
61 | #------------------------------------------------------------------------------
62 | class Residualizer(object):
63 | def __init__(self, C_t):
64 | # center and orthogonalize
65 | self.Q_t, _ = torch.linalg.qr(C_t - C_t.mean(0))
66 | self.dof = C_t.shape[0] - 2 - C_t.shape[1]
67 |
68 | def transform(self, M_t, center=True):
69 | """Residualize rows of M wrt columns of C"""
70 | M0_t = M_t - M_t.mean(1, keepdim=True)
71 | if center:
72 | M0_t = M0_t - torch.mm(torch.mm(M0_t, self.Q_t), self.Q_t.t())
73 | else:
74 |             M0_t = M_t - torch.mm(torch.mm(M0_t, self.Q_t), self.Q_t.t())  # subtract projection from uncentered M to keep the original mean
75 | return M0_t
76 |
77 |
78 | def calculate_maf(genotype_t, alleles=2):
79 | """Calculate minor allele frequency"""
80 | af_t = genotype_t.sum(1) / (alleles * genotype_t.shape[1])
81 | return torch.where(af_t > 0.5, 1 - af_t, af_t)
82 |
83 |
84 | def get_allele_stats(genotype_t):
85 | """Returns allele frequency, minor allele samples, and minor allele counts (row-wise)."""
86 | # allele frequency
87 | n2 = 2 * genotype_t.shape[1]
88 | af_t = genotype_t.sum(1) / n2
89 | # minor allele samples and counts
90 | ix_t = af_t <= 0.5
91 | m = genotype_t > 0.5
92 | a = m.sum(1).int()
93 | b = (genotype_t < 1.5).sum(1).int()
94 | ma_samples_t = torch.where(ix_t, a, b)
95 | a = (genotype_t * m.float()).sum(1).int()
96 | # a = (genotype_t * m.float()).sum(1).round().int() # round for missing/imputed genotypes
97 | ma_count_t = torch.where(ix_t, a, n2-a)
98 | return af_t, ma_samples_t, ma_count_t
99 |
100 |
101 | def filter_maf(genotypes_t, variant_ids, maf_threshold, alleles=2):
102 | """Calculate MAF and filter genotypes that don't pass threshold"""
103 | af_t = genotypes_t.sum(1) / (alleles * genotypes_t.shape[1])
104 | maf_t = torch.where(af_t > 0.5, 1 - af_t, af_t)
105 | if maf_threshold > 0:
106 | mask_t = maf_t >= maf_threshold
107 | genotypes_t = genotypes_t[mask_t]
108 | variant_ids = variant_ids[mask_t.cpu().numpy().astype(bool)]
109 | af_t = af_t[mask_t]
110 | return genotypes_t, variant_ids, af_t
111 |
112 |
113 | def filter_maf_interaction(genotypes_t, interaction_mask_t=None, maf_threshold_interaction=0.05):
114 | # filter monomorphic sites (to avoid colinearity)
115 | mask_t = ~((genotypes_t==0).all(1) | (genotypes_t==1).all(1) | (genotypes_t==2).all(1))
116 | if interaction_mask_t is not None:
117 | upper_t = calculate_maf(genotypes_t[:, interaction_mask_t]) >= maf_threshold_interaction - 1e-7
118 | lower_t = calculate_maf(genotypes_t[:,~interaction_mask_t]) >= maf_threshold_interaction - 1e-7
119 | mask_t = mask_t & upper_t & lower_t
120 | genotypes_t = genotypes_t[mask_t]
121 | return genotypes_t, mask_t
122 |
123 |
124 | def impute_mean(genotypes_t, missing=-9):
125 | """Impute missing genotypes to mean"""
126 | m = genotypes_t == missing
127 | ix = torch.nonzero(m, as_tuple=True)[0]
128 | if len(ix) > 0:
129 | a = genotypes_t.sum(1)
130 | b = m.sum(1).float()
131 | mu = (a - missing*b) / (genotypes_t.shape[1] - b)
132 | genotypes_t[m] = mu[ix]
133 |
134 |
135 | def center_normalize(M_t, dim=0):
136 | """Center and normalize M"""
137 | N_t = M_t - M_t.mean(dim=dim, keepdim=True)
138 | return N_t / torch.sqrt(torch.pow(N_t, 2).sum(dim=dim, keepdim=True))
139 |
140 |
141 | def calculate_corr(genotype_t, phenotype_t, residualizer=None, return_var=False):
142 | """Calculate correlation between normalized residual genotypes and phenotypes"""
143 |
144 | # residualize
145 | if residualizer is not None:
146 | genotype_res_t = residualizer.transform(genotype_t) # variants x samples
147 | phenotype_res_t = residualizer.transform(phenotype_t) # phenotypes x samples
148 | else:
149 | genotype_res_t = genotype_t
150 | phenotype_res_t = phenotype_t
151 |
152 | if return_var:
153 | genotype_var_t = genotype_res_t.var(1)
154 | phenotype_var_t = phenotype_res_t.var(1)
155 |
156 | # center and normalize
157 | genotype_res_t = center_normalize(genotype_res_t, dim=1)
158 | phenotype_res_t = center_normalize(phenotype_res_t, dim=1)
159 |
160 | # correlation
161 | if return_var:
162 | return torch.mm(genotype_res_t, phenotype_res_t.t()), genotype_var_t, phenotype_var_t
163 | else:
164 | return torch.mm(genotype_res_t, phenotype_res_t.t())
165 |
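# Example (illustrative sketch with synthetic tensors): residualizing covariates
# and computing genotype-phenotype correlations with the class/function above.
#   C_t = torch.randn(50, 5)                               # samples x covariates
#   residualizer = Residualizer(C_t)                       # dof = 50 - 2 - 5
#   genotypes_t = torch.randint(0, 3, (100, 50)).float()   # variants x samples
#   phenotypes_t = torch.randn(10, 50)                     # phenotypes x samples
#   r_t = calculate_corr(genotypes_t, phenotypes_t, residualizer)  # variants x phenotypes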
166 |
167 | def get_t_pval(t, df, log=False):
168 | """
169 | Get p-value corresponding to t statistic and degrees of freedom (df). t and/or df can be arrays.
170 | If log=True, returns -log10(P).
171 | """
172 | if not log:
173 | return 2 * stats.t.cdf(-abs(t), df)
174 | else:
175 | if has_rpy2:
176 | return -(rfunc.t_cdf(-abs(t), df, lower_tail=True, log=True) + np.log(2)) * np.log10(np.e)
177 | else:
178 | raise ValueError("R and rpy2 are required to compute -log10(P)")
179 |
180 |
181 | def calculate_interaction_nominal(genotypes_t, phenotypes_t, interaction_t, residualizer=None,
182 | return_sparse=False, tstat_threshold=None, variant_ids=None):
183 | """
184 | Solve y ~ g + i + g:i, where i is an interaction vector or matrix
185 |
186 | Inputs
187 | genotypes_t: [num_genotypes x num_samples]
188 | phenotypes_t: [num_phenotypes x num_samples]
189 | interaction_t: [num_samples x num_interactions]
190 |
191 | Outputs
192 | if return_sparse is False (default):
193 | tstat_t, b_t, b_se_t, af_t, ma_samples_t, ma_count_t
194 | tstat_t, b_t, b_se_t columns: [g, i_1 ... i_n, gi_1, ... gi_n]
195 | where n is the number of interactions
196 | if return_sparse is True:
197 | tstat_g_t, tstat_i_t, tstat_gi_t, af_t, ix
198 | ix: indexes [genotype, phenotype]
199 | """
200 | ng, ns = genotypes_t.shape
201 | nps = phenotypes_t.shape[0]
202 | ni = interaction_t.shape[1]
203 |
204 | # centered inputs
205 | g0_t = genotypes_t - genotypes_t.mean(1, keepdim=True) # genotypes x samples
206 | gi_t = (genotypes_t.unsqueeze(2) * interaction_t.unsqueeze(0)) # genotypes x samples x interactions
207 | gi0_t = gi_t - gi_t.mean(1, keepdim=True) # mean across samples
208 | i0_t = interaction_t - interaction_t.mean(0) # samples x interactions
209 |     p0_t = phenotypes_t - phenotypes_t.mean(1, keepdim=True)  # phenotypes x samples
210 |
211 | # residualize rows
212 | if residualizer is not None:
213 | p0_t = residualizer.transform(p0_t, center=False)
214 | g0_t = residualizer.transform(g0_t, center=False)
215 | i0_t = residualizer.transform(i0_t.t(), center=False).t()
216 | for k in range(i0_t.shape[1]):
217 | gi0_t[..., k] = residualizer.transform(gi0_t[..., k], center=False)
218 |     i0_t = i0_t.repeat(ng, 1, 1)  # broadcast to genotypes x samples x interactions (required with or without residualizer)
219 |
220 | # regression (in float; loss of precision may occur in edge cases)
221 | X_t = torch.cat([g0_t.unsqueeze(-1), i0_t, gi0_t], 2) # ng x ns x (1+2*ni)
222 | try:
223 | Xinv = torch.matmul(torch.transpose(X_t, 1, 2), X_t).inverse() # ng x (1+2*ni) x (1+2*ni)
224 | except Exception as e:
225 | if variant_ids is not None and len(e.args) >= 1:
226 |             i = int(re.findall(r'For batch (\d+)', str(e))[0])
227 | e.args = (e.args[0] + f'\n Likely problematic variant: {variant_ids[i]} ',) + e.args[1:]
228 | raise
229 |
230 | p0_tile_t = p0_t.unsqueeze(0).expand([ng, *p0_t.shape]) # ng x np x ns
231 |
232 | # calculate b, b_se
233 | # [(ng x nb x nb) x (ng x nb x ns)] x (ng x ns x np) = (ng x nb x np)
234 | b_t = torch.matmul(torch.matmul(Xinv, torch.transpose(X_t, 1, 2)), torch.transpose(p0_tile_t, 1, 2))
235 | nb = b_t.shape[1]
236 | # residualizer.dof already includes intercept, b_g, add b_i and b_gi for each interaction
237 | if residualizer is not None:
238 | dof = residualizer.dof - 2*ni
239 | else:
240 | dof = phenotypes_t.shape[1] - 2 - 2*ni
241 | if nps == 1: # single phenotype case
242 | r_t = torch.matmul(X_t, b_t).squeeze() - p0_t
243 | rss_t = (r_t*r_t).sum(1)
244 | b_se_t = torch.sqrt(Xinv[:, torch.eye(nb, dtype=torch.uint8).bool()] * rss_t.unsqueeze(1) / dof)
245 | b_t = b_t.squeeze(2)
246 | # r_t = tf.squeeze(tf.matmul(X_t, b_t)) - p0_t # (ng x ns x 3) x (ng x 3 x 1)
247 | # rss_t = tf.reduce_sum(tf.multiply(r_t, r_t), axis=1)
248 | # b_se_t = tf.sqrt( tf.matrix_diag_part(Xinv) * tf.expand_dims(rss_t, 1) / dof )
249 | else:
250 | # b_t = tf.matmul(p0_tile_t, tf.matmul(Xinv, X_t, transpose_b=True), transpose_b=True)
251 | # convert to ng x np x 3??
252 | r_t = torch.matmul(X_t, b_t) - torch.transpose(p0_tile_t, 1, 2) # (ng x ns x np)
253 | rss_t = (r_t*r_t).sum(1) # ng x np
254 | b_se_t = torch.sqrt(Xinv[:, torch.eye(nb, dtype=torch.uint8).bool()].unsqueeze(-1).repeat([1,1,nps]) * rss_t.unsqueeze(1).repeat([1,3,1]) / dof)
255 | # b_se_t = tf.sqrt(tf.tile(tf.expand_dims(tf.matrix_diag_part(Xinv), 2), [1,1,nps]) * tf.tile(tf.expand_dims(rss_t, 1), [1,3,1]) / dof) # (ng x 3) -> (ng x 3 x np)
256 |
257 | tstat_t = (b_t.double() / b_se_t.double()).float() # (ng x nb x np)
258 |
259 | # tdist = tfp.distributions.StudentT(np.float64(dof), loc=np.float64(0.0), scale=np.float64(1.0))
260 | if not return_sparse:
261 | # calculate pval
262 | # pval_t = tf.scalar_mul(2, tdist.cdf(-tf.abs(tstat_t))) # (ng x 3 x np)
263 | af_t, ma_samples_t, ma_count_t = get_allele_stats(genotypes_t)
264 | return tstat_t, b_t, b_se_t, af_t, ma_samples_t, ma_count_t
265 |
266 | else: # sparse output
267 | if ni > 1:
268 | raise NotImplementedError("Sparse mode not yet supported for >1 interactions")
269 | af_t = genotypes_t.sum(1) / (2*ns)
270 | tstat_g_t = tstat_t[:,0,:] # genotypes x phenotypes
271 | tstat_i_t = tstat_t[:,1,:]
272 | tstat_gi_t = tstat_t[:,2,:]
273 | m = tstat_gi_t.abs() >= tstat_threshold
274 | tstat_g_t = tstat_g_t[m]
275 | tstat_i_t = tstat_i_t[m]
276 | tstat_gi_t = tstat_gi_t[m]
277 | ix = m.nonzero(as_tuple=False) # indexes: [genotype, phenotype]
278 | return tstat_g_t, tstat_i_t, tstat_gi_t, af_t[ix[:,0]], ix
279 |
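# Example (illustrative sketch with synthetic tensors): dense-output call with a
# single interaction term (ni=1); output columns are ordered [g, i, g:i].
#   genotypes_t = torch.randint(0, 3, (30, 100)).float()   # 30 variants x 100 samples
#   phenotypes_t = torch.randn(2, 100)                     # 2 phenotypes x 100 samples
#   interaction_t = torch.randn(100, 1)                    # 100 samples x 1 interaction
#   tstat_t, b_t, b_se_t, af_t, ma_samples_t, ma_count_t = \
#       calculate_interaction_nominal(genotypes_t, phenotypes_t, interaction_t)
#   # tstat_t, b_t, b_se_t: [30 x 3 x 2] (variants x [g, i, g:i] x phenotypes)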
280 |
281 | def linreg(X_t, y_t, dtype=torch.float64):
282 | """
283 | Robust linear regression. Solves y = Xb, standardizing X.
284 | The first column of X must be the intercept.
285 | """
286 | x_std_t = X_t.std(0)
287 | x_mean_t = X_t.mean(0)
288 | x_std_t[0] = 1
289 | x_mean_t[0] = 0
290 |
291 | # standardize X
292 | Xtilde_t = (X_t - x_mean_t) / x_std_t
293 |
294 | # regression
295 | XtX_t = torch.matmul(Xtilde_t.T, Xtilde_t)
296 | Xty_t = torch.matmul(Xtilde_t.T, y_t)
297 | b_t = torch.linalg.solve(XtX_t, Xty_t.unsqueeze(-1))
298 | b_t = b_t.squeeze()
299 |
300 | # compute s.e.
301 | dof = X_t.shape[0] - X_t.shape[1]
302 | r_t = y_t - torch.matmul(Xtilde_t, b_t)
303 | sigma2_t = (r_t*r_t).sum() / dof
304 | XtX_inv_t = torch.linalg.solve(XtX_t, torch.eye(X_t.shape[1], dtype=dtype).to(X_t.device))
305 | var_b_t = sigma2_t * XtX_inv_t
306 | b_se_t = torch.sqrt(torch.diag(var_b_t))
307 |
308 | # rescale
309 | b_t /= x_std_t
310 | b_se_t /= x_std_t
311 |
312 | # adjust intercept
313 | b_t[0] -= torch.sum(x_mean_t * b_t)
314 | ms_t = x_mean_t / x_std_t
315 | b_se_t[0] = torch.sqrt(b_se_t[0]**2 + torch.matmul(torch.matmul(ms_t.T, var_b_t), ms_t))
316 |
317 | return b_t, b_se_t
318 |
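# Example (illustrative sketch): recovering intercept = 1 and slope = 2 from
# noiseless data; note that the first column of X must be the intercept.
#   x = torch.arange(10, dtype=torch.float64)
#   X_t = torch.stack([torch.ones(10, dtype=torch.float64), x], dim=1)
#   b_t, b_se_t = linreg(X_t, 1 + 2*x)   # b_t -> [1.0, 2.0]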
319 |
320 | def filter_covariates(covariates_t, log_counts_t, tstat_threshold=2):
321 | """
322 | Inputs:
323 |       covariates_t: covariates matrix (samples x covariates)
324 | including genotype PCs, PEER factors, etc.
325 | ** with intercept in first column **
326 | log_counts_t: counts vector (samples)
327 | """
328 |     assert (covariates_t[:,0] == 1).all()  # first column must be the intercept
329 | b_t, b_se_t = linreg(covariates_t, log_counts_t)
330 | tstat_t = b_t / b_se_t
331 | m = tstat_t.abs() > tstat_threshold
332 | m[0] = False
333 | return covariates_t[:, m]
334 |
335 |
336 | #------------------------------------------------------------------------------
337 | # Functions for beta-approximating empirical p-values
338 | #------------------------------------------------------------------------------
339 | def pval_from_corr(r2, dof, logp=False):
340 | tstat2 = dof * r2 / (1 - r2)
341 | return get_t_pval(np.sqrt(tstat2), dof, log=logp)
342 |
343 |
344 | def beta_shape_1_from_dof(r2, dof):
345 | """compute the Beta shape 1 parameter from moment matching"""
346 | pval = pval_from_corr(r2, dof)
347 | mean = np.mean(pval)
348 | var = np.var(pval)
349 | return mean * (mean * (1.0-mean) / var - 1.0)
350 |
351 |
352 | def beta_log_likelihood(x, shape1, shape2):
353 | """negative log-likelihood of beta distribution"""
354 | logbeta = loggamma(shape1) + loggamma(shape2) - loggamma(shape1+shape2)
355 | return (1.0-shape1)*np.sum(np.log(x)) + (1.0-shape2)*np.sum(np.log(1.0-x)) + len(x)*logbeta
356 |
357 |
358 | def fit_beta_parameters(r2_perm, dof_init, tol=1e-4, return_minp=False):
359 | """
360 | r2_perm: array of max. r2 values from permutations
361 | dof_init: degrees of freedom
362 | """
363 | try:
364 | # Find the degrees of freedom such that the first beta parameter is
365 | # close to 1, by finding the root where the log of the beta parameter
366 | # as a function of r2_perm and dof is 0. Optimizing log(beta shape 1)
367 | # with a parameterization of log(dof) makes this close to a linear
368 | # function.
369 | log_true_dof = scipy.optimize.newton(lambda x: np.log(beta_shape_1_from_dof(r2_perm, np.exp(x))),
370 | np.log(dof_init), tol=tol, maxiter=50)
371 | true_dof = np.exp(log_true_dof)
372 | except:
373 | # fall back to minimization
374 | print('WARNING: scipy.optimize.newton failed to converge (running scipy.optimize.minimize)')
375 | res = scipy.optimize.minimize(lambda x: np.abs(beta_shape_1_from_dof(r2_perm, x) - 1),
376 | dof_init, method='Nelder-Mead', tol=tol)
377 | true_dof = res.x[0]
378 |
379 | pval = pval_from_corr(r2_perm, true_dof)
380 | mean, var = np.mean(pval), np.var(pval)
381 | beta_shape1 = mean * (mean * (1 - mean) / var - 1)
382 | beta_shape2 = beta_shape1 * (1/mean - 1)
383 | res = scipy.optimize.minimize(lambda s: beta_log_likelihood(pval, s[0], s[1]), [beta_shape1, beta_shape2], method='Nelder-Mead', tol=tol)
384 | beta_shape1, beta_shape2 = res.x
385 | if return_minp:
386 | return beta_shape1, beta_shape2, true_dof, pval
387 | else:
388 | return beta_shape1, beta_shape2, true_dof
389 |
390 |
391 | def calculate_beta_approx_pval(r2_perm, r2_nominal, dof_init, tol=1e-4):
392 | """
393 | r2_nominal: nominal max. r2 (scalar or array)
394 | r2_perm: array of max. r2 values from permutations
395 | dof_init: degrees of freedom
396 | """
397 | beta_shape1, beta_shape2, true_dof = fit_beta_parameters(r2_perm, dof_init, tol)
398 | pval_true_dof = pval_from_corr(r2_nominal, true_dof)
399 | pval_beta = stats.beta.cdf(pval_true_dof, beta_shape1, beta_shape2)
400 | return pval_beta, beta_shape1, beta_shape2, true_dof, pval_true_dof
401 |
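# Example (illustrative sketch with synthetic permutation results): r2_perm holds
# the maximum r2 from each permutation; pval_beta is the beta-approximated
# empirical p-value for the observed maximum r2.
#   r2_perm = np.random.beta(1, 80, size=1000)
#   pval_beta, beta_shape1, beta_shape2, true_dof, pval_true_df = \
#       calculate_beta_approx_pval(r2_perm, r2_nominal=0.15, dof_init=100)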
402 | #------------------------------------------------------------------------------
403 | # i/o functions
404 | #------------------------------------------------------------------------------
405 |
406 | def read_phenotype_bed(phenotype_bed):
407 | """Load phenotype BED file as phenotype and position DataFrames"""
408 | if phenotype_bed.lower().endswith(('.bed.gz', '.bed')):
409 | phenotype_df = pd.read_csv(phenotype_bed, sep='\t', index_col=3, dtype={'#chr':str, '#Chr':str})
410 | elif phenotype_bed.lower().endswith('.bed.parquet'):
411 | phenotype_df = pd.read_parquet(phenotype_bed)
412 | phenotype_df.set_index(phenotype_df.columns[3], inplace=True)
413 | else:
414 | raise ValueError('Unsupported file type.')
415 | phenotype_df.rename(columns={i:i.lower().replace('#chr','chr') for i in phenotype_df.columns[:3]}, inplace=True)
416 |
417 | phenotype_df['start'] += 1 # change to 1-based
418 | pos_df = phenotype_df[['chr', 'start', 'end']]
419 | phenotype_df.drop(['chr', 'start', 'end'], axis=1, inplace=True)
420 |
421 | # make sure BED file is properly sorted
422 | assert pos_df.equals(
423 | pos_df.groupby('chr', sort=False, group_keys=False).apply(lambda x: x.sort_values(['start', 'end']))
424 | ), "Positions in BED file must be sorted."
425 |
426 | if (pos_df['start'] == pos_df['end']).all():
427 | pos_df = pos_df[['chr', 'end']].rename(columns={'end':'pos'})
428 |
429 | return phenotype_df, pos_df
430 |
--------------------------------------------------------------------------------
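For reference, `read_phenotype_bed` above expects a (gzipped) UCSC-style BED file with a header line, phenotype IDs in the fourth column, and one column per sample; the values below are illustrative:

    #chr   start   end     gene_id            sample1   sample2
    chr18  10999   11000   ENSG00000263006    1.23      -0.87
    chr18  12000   12001   ENSG00000101557    0.45      0.11

Rows must be position-sorted within each chromosome. The `start` column is converted to 1-based coordinates, and if start + 1 == end for all rows (as in the TSS convention above), the two columns are collapsed into a single `pos` column.
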
/tensorqtl/eigenmt.py:
--------------------------------------------------------------------------------
1 | """eigenmt.py: Re-implementation of eigenMT (Davis et al., AJHG, 2016)"""
2 |
3 | __author__ = "Francois Aguet"
4 | __copyright__ = "Copyright 2019, The Broad Institute"
5 | __license__ = "BSD3"
6 |
7 | import torch
8 | import numpy as np
9 | import pandas as pd
10 | import time
11 | import os
12 | import sys
13 | from collections import OrderedDict
14 |
15 | sys.path.insert(1, os.path.dirname(__file__))
16 | import genotypeio
17 | from core import *
18 |
19 |
20 | def lw_shrink(X_t):
21 | """
22 | Estimates the shrunk Ledoit-Wolf covariance matrix
23 |
24 | Args:
25 | X_t: samples x variants
26 |
27 | Returns:
28 | shrunk_cov_t: shrunk covariance
29 | shrinkage_t: shrinkage coefficient
30 |
31 | Adapted from scikit-learn:
32 | https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/covariance/shrunk_covariance_.py
33 | """
34 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
35 |
36 | if len(X_t.shape) == 2:
37 | n_samples, n_features = X_t.shape # samples x variants
38 | X_t = X_t - X_t.mean(0)
39 | X2_t = X_t.pow(2)
40 | emp_cov_trace_sum = X2_t.sum() / n_samples
41 | delta_ = torch.mm(X_t.t(), X_t).pow(2).sum() / n_samples**2
42 | beta_ = torch.mm(X2_t.t(), X2_t).sum()
43 | beta = 1. / (n_features * n_samples) * (beta_ / n_samples - delta_)
44 | delta = delta_ - 1. * emp_cov_trace_sum**2 / n_features
45 | delta /= n_features
46 | beta = torch.min(beta, delta)
47 | shrinkage_t = 0 if beta == 0 else beta / delta
48 | emp_cov_t = torch.mm(X_t.t(), X_t) / n_samples
49 | mu_t = torch.trace(emp_cov_t) / n_features
50 | shrunk_cov_t = (1. - shrinkage_t) * emp_cov_t
51 | shrunk_cov_t.view(-1)[::n_features + 1] += shrinkage_t * mu_t # add to diagonal
52 | else: # broadcast along first dimension
53 | n_samples, n_features = X_t.shape[1:] # samples x variants
54 | X_t = X_t - X_t.mean(1, keepdim=True)
55 | X2_t = X_t.pow(2)
56 | emp_cov_trace_sum = X2_t.sum([1,2]) / n_samples
57 | delta_ = torch.matmul(torch.transpose(X_t, 1, 2), X_t).pow(2).sum([1,2]) / n_samples**2
58 | beta_ = torch.matmul(torch.transpose(X2_t, 1, 2), X2_t).sum([1,2])
59 | beta = 1. / (n_features * n_samples) * (beta_ / n_samples - delta_)
60 | delta = delta_ - 1. * emp_cov_trace_sum**2 / n_features
61 | delta /= n_features
62 | beta = torch.min(beta, delta)
63 | shrinkage_t = torch.where(beta==0, torch.zeros(beta.shape).to(device), beta/delta)
64 | emp_cov_t = torch.matmul(torch.transpose(X_t, 1, 2), X_t) / n_samples
65 | mu_t = torch.diagonal(emp_cov_t, dim1=1, dim2=2).sum(1) / n_features
66 | shrunk_cov_t = (1 - shrinkage_t.reshape([shrinkage_t.shape[0], 1, 1])) * emp_cov_t
67 |
68 | ix = torch.LongTensor(np.array([np.arange(0, n_features**2, n_features+1)+i*n_features**2 for i in range(X_t.shape[0])])).to(device)
69 | shrunk_cov_t.view(-1)[ix] += (shrinkage_t * mu_t).unsqueeze(-1) # add to diagonal
70 |
71 | return shrunk_cov_t, shrinkage_t
72 |
73 |
74 | def find_num_eigs(eigenvalues, variance, var_thresh=0.99):
75 | """Returns the number of eigenvalues required to reach threshold of variance explained."""
76 | eigenvalues = np.sort(eigenvalues)[::-1]
77 | running_sum = 0
78 | counter = 0
79 | while running_sum < variance * var_thresh:
80 | running_sum += eigenvalues[counter]
81 | counter += 1
82 | return counter
83 |
84 |
85 | def compute_tests(genotypes_t, var_thresh=0.99, variant_window=200):
86 | """determine effective number of independent variants (M_eff)"""
87 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
88 |
89 | # break into windows
90 | windows = torch.split(genotypes_t, variant_window)
91 |
92 | if len(windows)>1:
93 | shrunk_cov_t, shrinkage_t = lw_shrink(torch.transpose(torch.stack(windows[:-1]), 1, 2))
94 |
95 | n_samples, n_features = windows[0].T.shape
96 | # indices of diagonals
97 | ix = torch.LongTensor(np.array([np.arange(0, n_features**2, n_features+1)+i*n_features**2 for i in range(shrunk_cov_t.shape[0])])).to(device)
98 | shrunk_precision_t = torch.zeros(shrunk_cov_t.shape).to(device)
99 | shrunk_precision_t.view(-1)[ix] = shrunk_cov_t.view(-1)[ix].pow(-0.5)
100 | shrunk_cor_t = torch.matmul(torch.matmul(shrunk_precision_t, shrunk_cov_t), shrunk_precision_t)
101 | # eigenvalues_t,_ = torch.symeig(shrunk_cor_t, eigenvectors=False) # will be deprecated
102 | eigenvalues_t = torch.linalg.eigvalsh(shrunk_cor_t) # ~2x slower than symeig with 1.10.0+cu102 and 2.0.1+cu118
103 |
104 | # last window
105 | shrunk_cov0_t, shrinkage0_t = lw_shrink(windows[-1].t())
106 | shrunk_precision0_t = torch.diag(torch.diag(shrunk_cov0_t).pow(-0.5))
107 | shrunk_cor0_t = torch.mm(torch.mm(shrunk_precision0_t, shrunk_cov0_t), shrunk_precision0_t)
108 | # eigenvalues0_t,_ = torch.symeig(shrunk_cor0_t, eigenvectors=False)
109 | eigenvalues0_t = torch.linalg.eigvalsh(shrunk_cor0_t)
110 |
111 | if len(windows) > 1:
112 | eigenvalues = list(eigenvalues_t.cpu().numpy())
113 | eigenvalues.append(eigenvalues0_t.cpu().numpy())
114 | else:
115 | eigenvalues = [eigenvalues0_t.cpu().numpy()]
116 |
117 | m_eff = 0
118 | for ev,m in zip(eigenvalues, [i.shape[0] for i in windows]):
119 | ev[ev < 0] = 0
120 | m_eff += find_num_eigs(ev, m, var_thresh=var_thresh)
121 |
122 | return m_eff
123 |
124 |
125 |
126 | def run_eigenmt(genotype_df, variant_df, phenotype_df, phenotype_pos_df, interaction_s=None,
127 | maf_threshold=0, var_thresh=0.99, variant_window=200, window=1000000, verbose=True, logger=None):
128 | """Standalone function for computing eigenMT correction.
129 |
130 | Returns the number of tests for each gene
131 | """
132 |
133 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
134 |
135 | if logger is None:
136 | logger = SimpleLogger()
137 |
138 | logger.write('eigenMT: estimating number of independent variants tested for each phenotype')
139 |
141 | logger.write(f' * {phenotype_df.shape[1]} samples')
142 | logger.write(f' * {phenotype_df.shape[0]} phenotypes')
143 | logger.write(f' * {genotype_df.shape[0]} variants')
144 |
145 | if interaction_s is not None and maf_threshold > 0:
146 | interaction_mask_t = torch.BoolTensor(interaction_s >= interaction_s.median()).to(device)
147 | else:
148 | interaction_mask_t = None
149 |
150 | genotype_ix = np.array([genotype_df.columns.tolist().index(i) for i in phenotype_df.columns])
151 | genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
152 |
153 | igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, window=window)
154 | start_time = time.time()
155 | m_eff = OrderedDict()
156 | for k, (phenotype, genotypes, genotype_range, phenotype_id) in enumerate(igc.generate_data(verbose=verbose), 1):
157 |
158 | # copy genotypes to GPU
159 | genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
160 | genotypes_t = genotypes_t[:,genotype_ix_t]
161 | impute_mean(genotypes_t)
162 |
163 | if interaction_s is None:
164 | mask_t = calculate_maf(genotypes_t) >= maf_threshold
165 | genotypes_t = genotypes_t[mask_t]
166 | else:
167 | genotypes_t, mask_t = filter_maf_interaction(genotypes_t, interaction_mask_t=interaction_mask_t, maf_threshold_interaction=maf_threshold)
168 |
169 | m_eff[phenotype_id] = compute_tests(genotypes_t, var_thresh=var_thresh, variant_window=variant_window)
170 |
171 | logger.write(f' time elapsed: {(time.time()-start_time)/60:.2f} min')
172 | return pd.Series(m_eff)
173 |
174 |
175 | def padjust_bh(p):
176 | """Benjamini-Hochberg adjusted p-values"""
177 | if not np.all(np.isfinite(p)):
178 | raise ValueError('P values must be finite.')
179 | n = len(p)
180 | i = np.arange(n,0,-1)
181 | o = np.argsort(p)[::-1]
182 | ro = np.argsort(o)
183 | return np.minimum(1, np.minimum.accumulate(np.float64(n)/i * np.array(p)[o]))[ro]
184 |
--------------------------------------------------------------------------------
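A minimal usage sketch for the functions above (illustrative only; `genotype_df`, `variant_df`, `phenotype_df`, and `phenotype_pos_df` as loaded with the package's i/o functions, and `top_pval_s` a hypothetical Series holding the nominal p-value of each phenotype's top variant):

    import numpy as np

    # effective number of independent tests per phenotype
    tests_s = run_eigenmt(genotype_df, variant_df, phenotype_df, phenotype_pos_df,
                          window=1000000)

    # eigenMT correction (Bonferroni with M_eff), then BH across phenotypes
    pval_emt = np.minimum(top_pval_s * tests_s, 1)
    pval_adj = padjust_bh(pval_emt)
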
/tensorqtl/genotypeio.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import tempfile
3 | import numpy as np
4 | import subprocess
5 | import os
6 | import gzip
7 | import sys
8 | import threading
9 | import queue
10 | import bisect
11 | from pandas_plink import read_plink
12 |
13 | sys.path.insert(1, os.path.dirname(__file__))
14 | from core import *
15 |
16 | try:
17 | import pgen
18 | except ImportError as e:
19 | pgen = None
20 |
21 |
22 | gt_to_dosage_dict = {'0/0':0, '0/1':1, '1/1':2, './.':np.nan,
23 | '0|0':0, '0|1':1, '1|0':1, '1|1':2, '.|.':np.nan}
24 |
25 |
26 | def _check_dependency(name):
27 | e = subprocess.call(f"which {name}", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
28 | if e != 0:
29 | raise RuntimeError(f"External dependency '{name}' not installed")
30 |
31 |
32 | def print_progress(k, n, entity):
33 | s = f'\r processing {entity} {k}/{n}'
34 | if k == n:
35 | s += '\n'
36 | sys.stdout.write(s)
37 | sys.stdout.flush()
38 |
39 |
40 | class BackgroundGenerator(threading.Thread):
41 | # Adapted from https://github.com/justheuristic/prefetch_generator
42 | def __init__(self, generator, max_prefetch=10):
43 | threading.Thread.__init__(self)
44 | self.queue = queue.Queue(max_prefetch)
45 | self.generator = generator
46 | self.daemon = True
47 | self.start()
48 |
49 | def run(self):
50 | try:
51 | for item in self.generator:
52 | self.queue.put(item)
53 | except Exception as exception:
54 | self.queue.put(exception)
55 | self.queue.put(None)
56 |
57 | def next(self):
58 | next_item = self.queue.get()
59 | if next_item is None:
60 | self.join()
61 | raise StopIteration
62 | if isinstance(next_item, Exception):
63 | self.join()
64 | raise next_item
65 | return next_item
66 |
67 | def __next__(self):
68 | return self.next()
69 |
70 | def __iter__(self):
71 | return self
72 |
73 | class background:
74 | def __init__(self, max_prefetch=10):
75 | self.max_prefetch = max_prefetch
76 | def __call__(self,gen):
77 | def bg_generator(*args,**kwargs):
78 | return BackgroundGenerator(gen(*args,**kwargs), max_prefetch=self.max_prefetch)
79 | return bg_generator
80 |
81 |
82 | #------------------------------------------------------------------------------
83 | # Functions for writing VCFs
84 | #------------------------------------------------------------------------------
85 | def _get_vcf_opener(vcfpath):
86 | if vcfpath.endswith('.vcf.gz'):
87 | return gzip.open(vcfpath, 'rt')
88 | else:
89 | return open(vcfpath)
90 |
91 |
92 | def get_sample_ids(vcfpath):
93 | """Get sample IDs from VCF"""
94 | with _get_vcf_opener(vcfpath) as vcf:
95 | for header in vcf:
96 | if header[:2] == '##': continue
97 | break
98 | return header.strip().split('\t')[9:]
99 |
100 |
101 | def parse_genotypes(x, field='GT'):
102 | """Convert list of genotypes (str) to np.float32"""
103 | if field == 'GT':
104 | g = np.float32([gt_to_dosage_dict[i] for i in x])
105 | elif field == 'DS':
106 | g = np.float32(x)
107 | return g
108 |
109 |
110 | def _get_field_ix(line, field):
111 | """Get position of field ('GT' or 'DS') in FORMAT"""
112 | fmt = line[8].split(':')
113 | if field not in fmt:
114 | raise ValueError(f'FORMAT field does not contain {field}')
115 | return fmt.index(field)
116 |
117 | #------------------------------------------------------------------------------
118 | # Functions for loading regions/variants from VCFs
119 | #------------------------------------------------------------------------------
120 | def _impute_mean(g, missing=-9, verbose=False):
121 | """Impute rows to mean (in place)"""
122 | if not g.dtype in [np.float32, np.float64]:
123 | raise ValueError('Input dtype must be np.float32 or np.float64')
124 | n = 0
125 | for i in np.where((g == missing).any(1))[0]:
126 | ix = g[i] == missing
127 | g[i][ix] = np.mean(g[i][~ix])
128 | n += 1
129 | if verbose and n > 0:
130 | print(f' imputed at least 1 sample in {n}/{g.shape[0]} sites')
131 |
132 |
133 | class PlinkReader(object):
134 | def __init__(self, plink_prefix_path, select_samples=None, include_variants=None,
135 | exclude_variants=None, exclude_chrs=None, verbose=True, dtype=np.int8):
136 | """
137 | Class for reading genotypes from PLINK bed files
138 |
139 | plink_prefix_path: prefix to PLINK bed,bim,fam files
140 | select_samples: specify a subset of samples
141 |
142 | Notes:
143 | Use this command to convert a VCF to PLINK format:
144 | plink2 --make-bed \
145 | --output-chr chrM \
146 | --vcf ${plink_prefix_path}.vcf.gz \
147 | --out ${plink_prefix_path}
148 |
149 | If using plink v1, the --keep-allele-order flag must be included.
150 |
151 | Uses read_plink from pandas_plink.
152 | """
153 |
154 | self.bim, self.fam, self.bed = read_plink(plink_prefix_path, verbose=verbose)
155 | self.bed = 2 - self.bed # flip allele order: PLINK uses REF as effect allele
156 | if dtype == np.int8:
157 | self.bed[np.isnan(self.bed)] = -9 # convert missing (NaN) to -9 for int8
158 | self.bed = self.bed.astype(dtype, copy=False)
159 | self.sample_ids = self.fam['iid'].tolist()
160 | if select_samples is not None:
161 | ix = [self.sample_ids.index(i) for i in select_samples]
162 | self.fam = self.fam.loc[ix]
163 | self.bed = self.bed[:,ix]
164 | self.sample_ids = self.fam['iid'].tolist()
165 | if include_variants is not None:
166 | m = self.bim['snp'].isin(include_variants).values
167 | self.bed = self.bed[m,:]
168 | self.bim = self.bim[m]
169 | self.bim.reset_index(drop=True, inplace=True)
170 | self.bim['i'] = self.bim.index
171 | if exclude_variants is not None:
172 | m = ~self.bim['snp'].isin(exclude_variants).values
173 | self.bed = self.bed[m,:]
174 | self.bim = self.bim[m]
175 | self.bim.reset_index(drop=True, inplace=True)
176 | self.bim['i'] = self.bim.index
177 | if exclude_chrs is not None:
178 | m = ~self.bim['chrom'].isin(exclude_chrs).values
179 | self.bed = self.bed[m,:]
180 | self.bim = self.bim[m]
181 | self.bim.reset_index(drop=True, inplace=True)
182 | self.bim['i'] = self.bim.index
183 | self.n_samples = self.fam.shape[0]
184 | self.chrs = list(self.bim['chrom'].unique())
185 | self.variant_pos = {i:g['pos'] for i,g in self.bim.set_index('snp')[['chrom', 'pos']].groupby('chrom')}
186 | self.variant_pos_dict = self.bim.set_index('snp')['pos'].to_dict()
187 |
188 | def get_region_index(self, region_str, return_pos=False):
189 | s = region_str.split(':')
190 | chrom = s[0]
191 | c = self.bim[self.bim['chrom'] == chrom]
192 | if len(s) > 1:
193 | start, end = s[1].split('-')
194 | start = int(start)
195 | end = int(end)
196 | c = c[(c['pos'] >= start) & (c['pos'] <= end)]
197 | if return_pos:
198 | return c['i'].values, c.set_index('snp')['pos']
199 | else:
200 | return c['i'].values
201 |
202 | def get_region(self, region_str, sample_ids=None, impute=False, verbose=False, dtype=np.int8):
203 | """Get genotypes for a region defined by 'chr:start-end' or 'chr'"""
204 | ix, pos_s = self.get_region_index(region_str, return_pos=True)
205 | g = self.bed[ix, :].compute().astype(dtype)
206 | if sample_ids is not None:
207 | ix = [self.sample_ids.index(i) for i in sample_ids]
208 | g = g[:, ix]
209 | if impute:
210 | _impute_mean(g, verbose=verbose)
211 | return g, pos_s
212 |
213 | def get_genotypes(self, variant_ids, sample_ids=None, impute=False, verbose=False, dtype=np.int8):
214 | """Load genotypes for selected variant IDs"""
215 | c = self.bim[self.bim['snp'].isin(variant_ids)]
216 | g = self.bed[c.i.values, :].compute().astype(dtype)
217 | if sample_ids is not None:
218 | ix = [self.sample_ids.index(i) for i in sample_ids]
219 | g = g[:, ix]
220 | if impute:
221 | _impute_mean(g, verbose=verbose)
222 | return g, c.set_index('snp')['pos']
223 |
224 | def get_genotype(self, variant_id, sample_ids=None, impute=False, verbose=False, dtype=np.int8):
225 | """Load genotypes for a single variant ID as pd.Series"""
226 | g,_ = self.get_genotypes([variant_id], sample_ids=sample_ids, impute=impute, verbose=verbose, dtype=dtype)
227 | if sample_ids is None:
228 | return pd.Series(g[0], index=self.fam['iid'], name=variant_id)
229 | else:
230 | return pd.Series(g[0], index=sample_ids, name=variant_id)
231 |
232 | def load_genotypes(self):
233 | """Load all genotypes into memory, as pd.DataFrame"""
234 | return pd.DataFrame(self.bed.compute(), index=self.bim['snp'], columns=self.fam['iid'])
235 |
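# Example (illustrative sketch; 'plink_prefix' and variant_id are placeholders
# for the path prefix of a .bed/.bim/.fam file set and a variant ID from the bim):
#   pr = PlinkReader('plink_prefix')
#   genotype_df = pr.load_genotypes()                         # variants x samples
#   g, pos_s = pr.get_region('chr18:1-2000000', impute=True)  # genotype matrix + positions
#   g_s = pr.get_genotype(variant_id)                         # single variant, as pd.Series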
236 |
237 | def load_genotypes(genotype_path, select_samples=None, dosages=False):
238 | """Load all genotypes into a dataframe"""
239 | if all([os.path.exists(f"{genotype_path}.{ext}") for ext in ['pgen', 'psam', 'pvar']]):
240 | if pgen is None:
241 | raise ImportError('Pgenlib must be installed to use PLINK 2 pgen/psam/pvar files.')
242 | pgr = pgen.PgenReader(genotype_path, select_samples=select_samples)
243 | variant_df = pgr.pvar_df.set_index('id')[['chrom', 'pos']]
244 | if dosages:
245 | genotype_df = pgr.load_dosages()
246 | else:
247 | genotype_df = pgr.load_genotypes()
248 | elif all([os.path.exists(f"{genotype_path}.{ext}") for ext in ['bed', 'bim', 'fam']]):
249 | pr = PlinkReader(genotype_path, select_samples=select_samples, dtype=np.int8)
250 | genotype_df = pr.load_genotypes()
251 | variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]
252 | elif genotype_path.endswith(('.bed.parquet', '.bed.gz', '.bed')):
253 | genotype_df, variant_df = read_phenotype_bed(genotype_path)
254 | assert variant_df.columns[1] == 'pos', "The BED file must define a single position for each variant, with start + 1 == end."
255 | variant_df.columns = ['chrom', 'pos']
256 | elif genotype_path.endswith('.parquet'):
257 | genotype_df = pd.read_parquet(genotype_path)
258 | variant_df = None
259 | elif genotype_path.endswith('.gz'):
260 | with gzip.open(genotype_path, 'rt') as f:
261 | header = f.readline().strip().split('\t')
262 | dtypes = {i:np.float32 for i in header}
263 | dtypes[header[0]] = str
264 | genotype_df = pd.read_csv(genotype_path, sep='\t', index_col=0, dtype=dtypes)
265 | variant_df = None
266 | else:
267 | raise ValueError(f"Failed to load genotypes from {genotype_path}. Supported formats: pgen/psam/pvar, bed/bim/fam, parquet, tsv.gz")
268 | return genotype_df, variant_df
269 |
270 |
271 | def get_vcf_region(region_str, vcfpath, field='GT', sample_ids=None, select_samples=None, impute_missing=True):
272 | """Load VCF region (str: 'chr:start-end') as DataFrame (requires tabix)"""
273 | s = subprocess.check_output(f'tabix {vcfpath} {region_str}', shell=True)
274 | s = s.decode().strip().split('\n')
275 | s = [i.split('\t') for i in s]
276 |
277 | if sample_ids is None:
278 | sample_ids = get_sample_ids(vcfpath)
279 | variant_ids = [i[2] for i in s]
280 | pos_s = pd.Series([int(i[1]) for i in s], index=variant_ids)
281 |
282 | ix = _get_field_ix(s[0], field)
283 | g = np.array([parse_genotypes([i.split(':')[ix] for i in line[9:]], field=field) for line in s])
284 | df = pd.DataFrame(g, index=variant_ids, columns=sample_ids)
285 |
286 | if select_samples is not None:
287 | df = df[select_samples]
288 |
289 | if impute_missing:
290 | n = 0
291 | for v in df.values:
292 | m = np.isnan(v)
293 | if np.any(m):
294 | v[m] = np.mean(v[~m])
295 | n += 1
296 | if n > 0:
297 | print(f' imputed at least 1 sample in {n} sites')
298 |
299 | return df, pos_s
300 |
301 |
302 | def get_vcf_variants(variant_ids, vcfpath, field='GT', sample_ids=None):
303 | """Load a set of variants in VCF as DataFrame (requires tabix)"""
304 | variant_id_set = set(variant_ids)
305 | with tempfile.NamedTemporaryFile() as regions_file:
306 | df = pd.DataFrame([i.split('_')[:2] for i in variant_id_set], columns=['chr', 'pos'])
307 | df['pos'] = df['pos'].astype(int)
308 | df = df.sort_values(['chr', 'pos'])
309 | df.to_csv(regions_file.name, sep='\t', index=False, header=False)
310 | s = subprocess.check_output(f'tabix {vcfpath} --regions {regions_file.name}', shell=True)
311 | s = s.decode().strip().split('\n')
312 | s = [i.split('\t') for i in s]
313 |
314 | if sample_ids is None:
315 | sample_ids = get_sample_ids(vcfpath)
316 |
317 | ix = _get_field_ix(s[0], field)
318 | g = np.array([parse_genotypes([i.split(':')[ix] for i in line[9:]], field=field) for line in s])
319 | g = np.array([i for i in g if -1 not in i]) # filter missing here instead of ValueError?
320 |
321 | returned_variant_ids = [i[2] for i in s]
322 | ix = [k for k,i in enumerate(returned_variant_ids) if i in variant_id_set]
323 | g = np.array([g[i] for i in ix])
324 | returned_variant_ids = [returned_variant_ids[i] for i in ix]
325 | return pd.DataFrame(g.astype(np.float32), index=returned_variant_ids, columns=sample_ids)
326 |
327 | #------------------------------------------------------------------------------
328 | # Generator classes for batch processing of genotypes/phenotypes
329 | #------------------------------------------------------------------------------
330 | class GenotypeGeneratorTrans(object):
331 | def __init__(self, genotype_df, batch_size=50000, chr_s=None):
332 | """
333 | Generator for iterating over all variants (trans-scan)
334 |
335 | Inputs:
336 | genotype_df: Dataframe with genotypes (variants x samples)
337 | batch_size: Batch size for GPU processing
338 |
339 | Generates: genotype array (2D), variant ID array
340 | """
341 | self.genotype_df = genotype_df
342 | self.batch_size = batch_size
343 | self.num_batches = int(np.ceil(self.genotype_df.shape[0] / batch_size))
344 | self.batch_indexes = [[i*batch_size, (i+1)*batch_size] for i in range(self.num_batches)]
345 | self.batch_indexes[-1][1] = self.genotype_df.shape[0]
346 | if chr_s is not None:
347 | chroms, chr_ix = np.unique(chr_s, return_index=True)
348 | s = np.argsort(chr_ix)
349 | self.chroms = chroms[s]
350 | chr_ix = list(chr_ix[s]) + [chr_s.shape[0]]
351 | size_s = pd.Series(np.diff(chr_ix), index=self.chroms)
352 | self.chr_batch_indexes = {}
353 | for k,c in enumerate(self.chroms):
354 | num_batches = int(np.ceil(size_s[c] / batch_size))
355 | batch_indexes = [[chr_ix[k]+i*batch_size, chr_ix[k]+(i+1)*batch_size] for i in range(num_batches)]
356 | batch_indexes[-1][1] = chr_ix[k+1]
357 | self.chr_batch_indexes[c] = batch_indexes
358 |
359 | def __len__(self):
360 | return self.num_batches
361 |
362 | @background(max_prefetch=6)
363 | def generate_data(self, chrom=None, verbose=False, enum_start=1):
364 | """Generate batches from genotype data"""
365 | if chrom is None:
366 | batch_indexes = self.batch_indexes
367 | num_batches = self.num_batches
368 | else:
369 | batch_indexes = self.chr_batch_indexes[chrom]
370 | num_batches = np.sum([len(i) for i in self.chr_batch_indexes.values()])
371 |
372 | for k,i in enumerate(batch_indexes, enum_start): # loop through batches
373 | if verbose:
374 | print_progress(k, num_batches, 'batch')
375 | g = self.genotype_df.values[i[0]:i[1]]
376 | ix = self.genotype_df.index[i[0]:i[1]] # variant IDs
377 | yield g, ix
378 |
379 |
380 | def get_cis_ranges(phenotype_pos_df, chr_variant_dfs, window, verbose=True):
381 |     """
382 |     Get genotype index ranges spanning the cis-window of each phenotype.
383 |     Returns a dict of [start, end] indexes (inclusive) per phenotype, and a list of phenotype IDs without variants in the window.
384 |     """
385 | # check phenotypes & calculate genotype ranges
386 | # get genotype indexes corresponding to cis-window of each phenotype
387 | if 'pos' in phenotype_pos_df:
388 | phenotype_pos_df = phenotype_pos_df.rename(columns={'pos':'start'})
389 | phenotype_pos_df['end'] = phenotype_pos_df['start']
390 | phenotype_pos_dict = phenotype_pos_df.to_dict(orient='index')
391 |
392 | drop_ids = []
393 | cis_ranges = {}
394 | n = len(phenotype_pos_df)
395 | for k, phenotype_id in enumerate(phenotype_pos_df.index, 1):
396 | if verbose and (k % 1000 == 0 or k == n):
397 | print(f'\r * checking phenotypes: {k}/{n}', end='' if k != n else None)
398 |
399 | pos = phenotype_pos_dict[phenotype_id]
400 | chrom = pos['chr']
401 | m = len(chr_variant_dfs[chrom]['pos'].values)
402 | lb = bisect.bisect_left(chr_variant_dfs[chrom]['pos'].values, pos['start'] - window)
403 | ub = bisect.bisect_right(chr_variant_dfs[chrom]['pos'].values, pos['end'] + window)
404 | if lb != ub:
405 | r = chr_variant_dfs[chrom]['index'].values[[lb, ub - 1]]
406 | else:
407 | r = []
408 |
409 | if len(r) > 0:
410 | cis_ranges[phenotype_id] = r
411 | else:
412 | drop_ids.append(phenotype_id)
413 |
414 | return cis_ranges, drop_ids
415 |
416 |
417 | class InputGeneratorCis(object):
418 | """
419 | Input generator for cis-mapping
420 |
421 | Inputs:
422 | genotype_df: genotype DataFrame (genotypes x samples)
423 | variant_df: DataFrame mapping variant_id (index) to chrom, pos
424 | phenotype_df: phenotype DataFrame (phenotypes x samples)
425 | phenotype_pos_df: DataFrame defining position of each phenotype, with columns ['chr', 'pos'] or ['chr', 'start', 'end']
426 | window: cis-window; selects variants within +- cis-window from 'pos' (e.g., TSS for gene-based features)
427 | or within [start-window, end+window] if 'start' and 'end' are present in phenotype_pos_df
428 |
429 | Generates: phenotype array, genotype array (2D), cis-window indices, phenotype ID
430 | """
431 | def __init__(self, genotype_df, variant_df, phenotype_df, phenotype_pos_df, group_s=None, window=1000000):
432 | assert (genotype_df.index == variant_df.index).all()
433 | assert (phenotype_df.index == phenotype_df.index.unique()).all()
434 | self.genotype_df = genotype_df
435 | self.variant_df = variant_df.copy()
436 | self.variant_df['index'] = np.arange(variant_df.shape[0])
437 | self.n_samples = phenotype_df.shape[1]
438 |
439 | # drop phenotypes without genotypes on same contig
440 | variant_chrs = variant_df['chrom'].unique()
441 | phenotype_chrs = phenotype_pos_df['chr'].unique()
442 | self.chrs = [i for i in phenotype_chrs if i in variant_chrs]
443 | m = phenotype_pos_df['chr'].isin(self.chrs)
444 | if any(~m):
445 | print(f' ** dropping {sum(~m)} phenotypes on chrs. without genotypes')
446 | self.phenotype_df = phenotype_df[m]
447 | self.phenotype_pos_df = phenotype_pos_df[m]
448 |
449 | # check for constant phenotypes and drop
450 | m = np.all(self.phenotype_df.values == self.phenotype_df.values[:,[0]], 1)
451 | if m.any():
452 | print(f' ** dropping {np.sum(m)} constant phenotypes')
453 | self.phenotype_df = self.phenotype_df.loc[~m]
454 | self.phenotype_pos_df = self.phenotype_pos_df.loc[~m]
455 |
456 | if len(self.phenotype_df) == 0:
457 | raise ValueError("No phenotypes remain after filters.")
458 |
459 | self.group_s = None
460 | self.window = window
461 |
462 | self.chr_variant_dfs = {c:g[['pos', 'index']] for c,g in self.variant_df.groupby('chrom')}
463 |
464 | # check phenotypes & calculate genotype ranges
465 | # get genotype indexes corresponding to cis-window of each phenotype
466 | self.cis_ranges, drop_ids = get_cis_ranges(self.phenotype_pos_df, self.chr_variant_dfs, self.window)
467 | if len(drop_ids) > 0:
468 | print(f" ** dropping {len(drop_ids)} phenotypes without variants in cis-window")
469 | self.phenotype_df = self.phenotype_df.drop(drop_ids)
470 | self.phenotype_pos_df = self.phenotype_pos_df.drop(drop_ids)
471 | if 'pos' in self.phenotype_pos_df:
472 | self.phenotype_start = self.phenotype_pos_df['pos'].to_dict()
473 | self.phenotype_end = self.phenotype_start
474 | else:
475 | self.phenotype_start = self.phenotype_pos_df['start'].to_dict()
476 | self.phenotype_end = self.phenotype_pos_df['end'].to_dict()
477 | self.n_phenotypes = self.phenotype_df.shape[0]
478 |
479 | if group_s is not None:
480 | self.group_s = group_s.loc[self.phenotype_df.index].copy()
481 | self.n_groups = self.group_s.unique().shape[0]
482 |
483 |
484 | @background(max_prefetch=6)
485 | def generate_data(self, chrom=None, verbose=False):
486 | """
487 | Generate batches from genotype data
488 |
489 | Returns: phenotype array, genotype matrix, genotype index, phenotype ID(s), [group ID]
490 | """
491 | if chrom is None:
492 | phenotype_ids = self.phenotype_df.index
493 | chr_offset = 0
494 | else:
495 | phenotype_ids = self.phenotype_pos_df[self.phenotype_pos_df['chr'] == chrom].index
496 | if self.group_s is None:
497 | offset_dict = {i:j for i,j in zip(*np.unique(self.phenotype_pos_df['chr'], return_index=True))}
498 | else:
499 | offset_dict = {i:j for i,j in zip(*np.unique(self.phenotype_pos_df['chr'][self.group_s.drop_duplicates().index], return_index=True))}
500 | chr_offset = offset_dict[chrom]
501 |
502 | index_dict = {j:i for i,j in enumerate(self.phenotype_df.index)}
503 |
504 | if self.group_s is None:
505 | for k,phenotype_id in enumerate(phenotype_ids, chr_offset+1):
506 | if verbose:
507 | print_progress(k, self.n_phenotypes, 'phenotype')
508 | p = self.phenotype_df.values[index_dict[phenotype_id]]
509 | # p = self.phenotype_df.values[k]
510 | r = self.cis_ranges[phenotype_id]
511 | yield p, self.genotype_df.values[r[0]:r[-1]+1], np.arange(r[0],r[-1]+1), phenotype_id
512 | else:
513 | gdf = self.group_s[phenotype_ids].groupby(self.group_s, sort=False)
514 | for k,(group_id,g) in enumerate(gdf, chr_offset+1):
515 | if verbose:
516 | print_progress(k, self.n_groups, 'phenotype group')
517 | # check that ranges are the same for all phenotypes within group
518 | assert np.all([self.cis_ranges[g.index[0]][0] == self.cis_ranges[i][0] and self.cis_ranges[g.index[0]][1] == self.cis_ranges[i][1] for i in g.index[1:]])
519 | group_phenotype_ids = g.index.tolist()
520 | # p = self.phenotype_df.loc[group_phenotype_ids].values
521 | p = self.phenotype_df.values[[index_dict[i] for i in group_phenotype_ids]]
522 | r = self.cis_ranges[g.index[0]]
523 | yield p, self.genotype_df.values[r[0]:r[-1]+1], np.arange(r[0],r[-1]+1), group_phenotype_ids, group_id
524 |
525 |
526 | def get_chunk_size(memory_gb, samples):
527 |     """Approximate number of variants (at one byte per genotype) that fit in memory_gb, given the sample count."""
528 | return memory_gb * 1024**3 // samples
529 |
530 |
531 | def generate_paired_chunks(pgr, phenotype_df, phenotype_pos_df, chunk_size, window=1000000,
532 | dosages=False, verbose=True):
533 | """
534 | Generate paired genotype-phenotype chunks for large datasets where only a subset of
535 | genotypes can be loaded into memory.
536 |
537 | pgr: pgen.PgenReader
538 | phenotype_df: phenotype DataFrame (phenotypes x samples)
539 | phenotype_pos_df: DataFrame defining position of each phenotype, with columns ['chr', 'pos'] or ['chr', 'start', 'end']
540 | chunk_size: maximum number of variants to load into CPU memory
541 | window: cis-window
542 | dosages: load dosages (DS) from genotype files (default: GT)
543 | """
544 | variant_df = pgr.pvar_df.set_index('id')[['chrom', 'pos']]
545 | cis_ranges, _ = get_cis_ranges(phenotype_pos_df, pgr.variant_dfs, window)
546 | range_df = pd.DataFrame(cis_ranges, index=['start', 'end']).T
547 | range_df = range_df.join(phenotype_pos_df['chr'])
548 |
549 | if chunk_size == 'chr':
550 | chrlen_s = range_df['chr'].value_counts(sort=False)
551 | start_ixs = [0] + chrlen_s.cumsum().tolist()
552 | else:
553 | chunk_size = int(chunk_size)
554 | # check chunk size
555 | max_cis_var = (range_df['end'] - range_df['start'] + 1).max()
556 | if not max_cis_var <= chunk_size:
557 | raise ValueError(f"Max. chunk size must be at least largest cis-window ({max_cis_var})")
558 |
559 | start_ixs = [0]
560 | while start_ixs[-1] < range_df.shape[0]:
561 | end_ix = bisect.bisect_left(range_df['end'].values, range_df['start'].values[start_ixs[-1]] + chunk_size)
562 | start_ixs.append(end_ix)
563 | start_ixs[-1] = range_df.shape[0]
564 |
565 | nchunks = len(start_ixs) - 1
566 | for ci in range(nchunks):
567 | if verbose:
568 | print(f"Processing genotype-phenotype chunk {ci+1}/{nchunks}")
569 | ix = slice(start_ixs[ci], start_ixs[ci+1])
570 | chunk_df = range_df[ix]
571 | if chunk_size == 'chr':
572 | assert (chunk_df['chr'] == chrlen_s.index[ci]).all()
573 | if dosages:
574 | gt_df = pgr.read_dosages_range(chunk_df['start'].values[0], chunk_df['end'].values[-1], dtype=np.float32)
575 | else:
576 | gt_df = pgr.read_range(chunk_df['start'].values[0], chunk_df['end'].values[-1], impute_mean=False, dtype=np.int8)
577 | var_df = variant_df.iloc[chunk_df['start'].values[0]:chunk_df['end'].values[-1]+1]
578 | yield gt_df, var_df, phenotype_df[ix], phenotype_pos_df[ix], ci
579 |
--------------------------------------------------------------------------------
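A minimal iteration sketch for `InputGeneratorCis` above (illustrative only; inputs as returned by `load_genotypes` and `read_phenotype_bed`):

    igc = InputGeneratorCis(genotype_df, variant_df, phenotype_df, phenotype_pos_df,
                            window=1000000)
    for phenotype, genotypes, genotype_range, phenotype_id in igc.generate_data(verbose=True):
        # phenotype: 1D array (samples,); genotypes: 2D array (cis variants x samples)
        # genotype_range: row indexes of these variants in genotype_df
        pass
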
/tensorqtl/mixqtl.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import os
4 | import sys
5 | sys.path.insert(1, os.path.dirname(__file__))
6 | import cis
7 | from core import *
8 |
9 |
10 | def trc(genotypes_t, counts_t, covariates_t=None, select_covariates=True,
11 | count_threshold=0, imputation='offset', mode='standard', return_af=False):
12 | """
13 | Inputs
14 | genotypes_t: dosages (variants x samples)
15 | counts_t: DESeq size factor-normalized read counts
16 | covariates_t: covariates matrix, first column must be intercept
17 | mode: if 'standard', parallel regression for each variant in genotypes_t
18 | if 'multi', multiple regression for all variants in genotypes_t
19 |
20 | Outputs:
21 |       t-statistic, beta, beta_se (+ af, ma_samples, ma_counts if return_af=True) (mode='standard')
22 | beta, beta_se (mode='multi')
23 | """
24 | nonzero_t = counts_t != 0
25 |
26 | if imputation == 'offset':
27 | log_counts_t = counts_t.log1p()
28 | elif imputation == 'half_min':
29 | log_counts_t = counts_t.clone()
30 | log_counts_t[~nonzero_t] = log_counts_t[nonzero_t].min() / 2
31 | log_counts_t = log_counts_t.log()
32 |
33 | if covariates_t is not None:
34 | if select_covariates:
35 | # select significant covariates
36 | b_t, b_se_t = linreg(covariates_t[nonzero_t, :], log_counts_t[nonzero_t], dtype=torch.float32)
37 | tstat_t = b_t / b_se_t
38 | m = tstat_t.abs() > 2
39 | m[0] = True # keep intercept
40 | sel_covariates_t = covariates_t[:, m]
41 | else:
42 | sel_covariates_t = covariates_t
43 |
44 | # Regress out covariates from non-zero counts, and keep zeros.
45 | # This follows the original mixQTL implementation, but may be
46 | # problematic when count_threshold is 0.
47 | residualizer = Residualizer(sel_covariates_t[nonzero_t, 1:]) # exclude intercept
48 | y_t = counts_t.clone()
49 |         y_t[nonzero_t] = residualizer.transform(log_counts_t[nonzero_t].reshape(1,-1), center=True).squeeze()
50 | else:
51 | y_t = log_counts_t
52 |
53 | m_t = counts_t >= count_threshold
54 |
55 | if mode == 'standard':
56 | res = cis.calculate_cis_nominal(genotypes_t[:, m_t] / 2, y_t[m_t], return_af=False)
57 | if return_af:
58 | af, ma_samples, ma_counts = get_allele_stats(genotypes_t)
59 | return *res, af, ma_samples, ma_counts
60 | else:
61 | return res
62 |
63 | elif mode.startswith('multi'):
64 | X_t = torch.cat([torch.ones([m_t.sum(), 1], dtype=bool).to(genotypes_t.device), genotypes_t[:, m_t].T / 2], axis=1)
65 | b_t, b_se_t = linreg(X_t, y_t[m_t], dtype=torch.float32)
66 | return b_t[1:], b_se_t[1:]
67 |
--------------------------------------------------------------------------------
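A minimal usage sketch for `trc` above (illustrative only, with synthetic tensors; the covariates matrix must include an intercept column of ones in the first position, and per the docstring the standard mode returns t-statistics, betas, and standard errors):

    import torch

    genotypes_t = torch.randint(0, 3, (50, 200)).float()     # variants x samples
    counts_t = torch.rand(200) * 100 + 1                     # size factor-normalized counts
    covariates_t = torch.cat([torch.ones(200, 1),
                              torch.randn(200, 10)], dim=1)  # intercept + 10 covariates

    tstat_t, beta_t, beta_se_t = trc(genotypes_t, counts_t, covariates_t=covariates_t,
                                     select_covariates=True, mode='standard')
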
/tensorqtl/pgen.py:
--------------------------------------------------------------------------------
1 | # Functions for reading dosages from PLINK pgen files based on the Pgenlib Python API:
2 | # https://github.com/chrchang/plink-ng/blob/master/2.0/Python/python_api.txt
3 |
4 | import numpy as np
5 | import pandas as pd
6 | import pgenlib as pg
7 | import os
8 | import bisect
9 |
10 |
11 | def read_pvar(pvar_path):
12 | """Read pvar file as pd.DataFrame"""
13 | return pd.read_csv(pvar_path, sep='\t', comment='#',
14 | names=['chrom', 'pos', 'id', 'ref', 'alt', 'qual', 'filter', 'info'],
15 | dtype={'chrom':str, 'pos':np.int32, 'id':str, 'ref':str, 'alt':str,
16 | 'qual':str, 'filter':str, 'info':str})
17 |
18 |
19 | def read_psam(psam_path):
20 | """Read psam file as pd.DataFrame"""
21 | psam_df = pd.read_csv(psam_path, sep='\t', index_col=0)
22 | psam_df.index = psam_df.index.astype(str)
23 | return psam_df
24 |
25 |
26 | def hardcall_phase_present(pgen_path):
27 | """Returns True iff phased hardcalls may be present"""
28 | with pg.PgenReader(pgen_path.encode()) as r:
29 | return r.hardcall_phase_present()
30 |
31 |
32 | def get_reader(pgen_path, sample_subset=None):
33 |     """Instantiate a PgenReader, returning the reader and the number of (selected) samples."""
34 | if sample_subset is not None:
35 | sample_subset = np.array(sample_subset, dtype=np.uint32)
36 | reader = pg.PgenReader(pgen_path.encode(), sample_subset=sample_subset)
37 | if sample_subset is None:
38 | num_samples = reader.get_raw_sample_ct()
39 | else:
40 | num_samples = len(sample_subset)
41 | return reader, num_samples
42 |
43 |
44 | def read(pgen_path, variant_idx, sample_subset=None, dtype=np.int8):
45 | """
46 | Get genotypes for a variant.
47 |
48 | Parameters
49 | ----------
50 | pgen_path : str
51 | Path of PLINK 2 pgen file
52 | variant_idx : int
53 | Variant index
54 | sample_subset : array_like
55 | List of sample indexes to select. Must be sorted.
56 | dtype : np.int{8,32,64}
57 | Data type of the returned array.
58 |
59 | Returns
60 | -------
61 | dosages : ndarray
62 | Genotypes (as {0, 1, 2, -9}) for the selected variant and samples.
63 | """
64 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset)
65 | genotypes = np.zeros(num_samples, dtype=dtype)
66 | with reader as r:
67 | r.read(np.array(variant_idx, dtype=np.uint32), genotypes)
68 | return genotypes
69 |
70 |
71 | def read_dosages(pgen_path, variant_idx, sample_subset=None, dtype=np.float32):
72 | """
73 | Get dosages for a variant.
74 |
75 | Parameters
76 | ----------
77 | pgen_path : str
78 | Path of PLINK 2 pgen file
79 | variant_idx : int
80 | Variant index
81 | sample_subset : array_like
82 | List of sample indexes to select. Must be sorted.
83 | dtype : np.float{32,64}
84 | Data type of the returned array.
85 |
86 | Returns
87 | -------
88 | dosages : ndarray
89 | Genotype dosages for the selected variant and samples.
90 | """
91 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset)
92 | dosages = np.zeros(num_samples, dtype=dtype)
93 | with reader as r:
94 | r.read_dosages(np.array(variant_idx, dtype=np.uint32), dosages)
95 | return dosages
96 |
97 |
98 | def read_alleles(pgen_path, variant_idx, sample_subset=None):
99 | """
100 | Get alleles for a variant.
101 |
102 | Parameters
103 | ----------
104 | pgen_path : str
105 | Path of PLINK 2 pgen file
106 | variant_idx : int
107 | Variant index
108 | sample_subset : array_like
109 | List of sample indexes to select. Must be sorted.
110 |
111 | Returns
112 | -------
113 | alleles: ndarray (2 * sample_ct)
114 | Alleles for the selected variant and samples.
115 | Elements 2n and 2n+1 correspond to sample n.
116 | Both elements are -9 for missing genotypes.
117 | If the genotype is unphased, the lower index appears first.
118 | """
119 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset)
120 | alleles = np.zeros(2*num_samples, dtype=np.int32)
121 | with reader as r:
122 | r.read_alleles(np.array(variant_idx, dtype=np.uint32), alleles)
123 | return alleles
124 |
125 |
126 | def read_list(pgen_path, variant_idxs, sample_subset=None, dtype=np.int8):
127 | """
128 | Get genotypes for a list of variants.
129 |
130 | Parameters
131 | ----------
132 | pgen_path : str
133 | Path of PLINK 2 pgen file
134 | variant_idxs : array_like
135 | List of variant indexes
136 | sample_subset : array_like
137 | List of sample indexes to select. Must be sorted.
138 | dtype : np.int{8,32,64}
139 | Data type of the returned array.
140 |
141 | Returns
142 | -------
143 | dosages : ndarray
144 | Genotypes for the selected variants and samples.
145 | """
146 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset)
147 | num_variants = len(variant_idxs)
148 | genotypes = np.zeros([num_variants, num_samples], dtype=dtype)
149 | with reader as r:
150 | r.read_list(np.array(variant_idxs, dtype=np.uint32), genotypes)
151 | return genotypes
152 |
153 |
154 | def read_dosages_list(pgen_path, variant_idxs, sample_subset=None, dtype=np.float32):
155 | """
156 | Get dosages for a list of variants.
157 |
158 | Parameters
159 | ----------
160 | pgen_path : str
161 | Path of PLINK 2 pgen file
162 | variant_idxs : array_like
163 | List of variant indexes
164 | sample_subset : array_like
165 | List of sample indexes to select. Must be sorted.
166 | dtype : np.float{32,64}
167 | Data type of the returned array.
168 |
169 | Returns
170 | -------
171 | dosages : ndarray
172 | Genotype dosages for the selected variants and samples.
173 | """
174 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset)
175 | num_variants = len(variant_idxs)
176 | dosages = np.zeros([num_variants, num_samples], dtype=dtype)
177 | with reader as r:
178 | r.read_dosages_list(np.array(variant_idxs, dtype=np.uint32), dosages)
179 | return dosages
180 |
181 |
182 | def read_alleles_list(pgen_path, variant_idxs, sample_subset=None):
183 | """
184 | Get alleles for a list of variants.
185 |
186 | Parameters
187 | ----------
188 | pgen_path : str
189 | Path of PLINK 2 pgen file
190 | variant_idxs : array_like
191 | List of variant indexes
192 | sample_subset : array_like
193 | List of sample indexes to select. Must be sorted.
194 |
195 | Returns
196 | -------
197 | alleles : ndarray
198 | Alleles for the selected variants and samples.
199 | """
200 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset)
201 | num_variants = len(variant_idxs)
202 | alleles = np.zeros([num_variants, 2*num_samples], dtype=np.int32)
203 | with reader as r:
204 | r.read_alleles_list(np.array(variant_idxs, dtype=np.uint32), alleles)
205 | return alleles
206 |
207 |
208 | def read_range(pgen_path, start_idx, end_idx, sample_subset=None, dtype=np.int8):
209 | """
210 | Get genotypes for a range of variants.
211 |
212 | Parameters
213 | ----------
214 | pgen_path : str
215 | Path of PLINK 2 pgen file
216 | start_idx : int
217 | Start index of the range to query.
218 | end_idx : int
219 | End index of the range to query (inclusive).
220 | sample_subset : array_like
221 | List of sample indexes to select. Must be sorted.
222 | dtype : np.int{8,32,64}
223 | Data type of the returned array.
224 |
225 | Returns
226 | -------
227 | dosages : ndarray
228 | Genotypes for the selected variants and samples.
229 | """
230 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset)
231 | num_variants = end_idx - start_idx + 1
232 | genotypes = np.zeros([num_variants, num_samples], dtype=dtype)
233 | with reader as r:
234 | r.read_range(start_idx, end_idx+1, genotypes)
235 | return genotypes
236 |
237 |
238 | def read_dosages_range(pgen_path, start_idx, end_idx, sample_subset=None, dtype=np.float32):
239 | """
240 | Get dosages for a range of variants.
241 |
242 | Parameters
243 | ----------
244 | pgen_path : str
245 | Path of PLINK 2 pgen file
246 | start_idx : int
247 | Start index of the range to query.
248 | end_idx : int
249 | End index of the range to query (inclusive).
250 | sample_subset : array_like
251 | List of sample indexes to select. Must be sorted.
252 | dtype : np.float{32,64}
253 | Data type of the returned array.
254 |
255 | Returns
256 | -------
257 | dosages : ndarray
258 | Genotype dosages for the selected variants and samples.
259 | """
260 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset)
261 | num_variants = end_idx - start_idx + 1
262 | dosages = np.zeros([num_variants, num_samples], dtype=dtype)
263 | with reader as r:
264 | r.read_dosages_range(start_idx, end_idx+1, dosages)
265 | return dosages
266 |
267 |
268 | def read_alleles_range(pgen_path, start_idx, end_idx, sample_subset=None):
269 | """
270 | Get alleles for a range of variants.
271 |
272 | Parameters
273 | ----------
274 | pgen_path : str
275 | Path of PLINK 2 pgen file
276 | start_idx : int
277 | Start index of the range to query.
278 | end_idx : int
279 | End index of the range to query (inclusive).
280 | sample_subset : array_like
281 | List of sample indexes to select. Must be sorted.
282 |
283 | Returns
284 | -------
285 | alleles : ndarray
286 | Alleles for the selected variants and samples.
287 | """
288 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset)
289 | num_variants = end_idx - start_idx + 1
290 | alleles = np.zeros([num_variants, 2*num_samples], dtype=np.int32)
291 | with reader as r:
292 | r.read_alleles_range(start_idx, end_idx+1, alleles)
293 | return alleles
294 |
295 |
296 | def _impute_mean(genotypes):
297 |     """Impute missing genotypes (-9) to the per-variant mean, in place"""
298 |     m = genotypes == -9
299 |     if genotypes.ndim == 1 and m.any():
300 |         genotypes[m] = genotypes[~m].mean()
301 |     else:  # genotypes.ndim == 2 (variants x samples)
302 |         ix = np.nonzero(m)[0]  # row (variant) index of each missing entry
303 |         if len(ix) > 0:
304 |             a = genotypes.sum(1)
305 |             b = m.sum(1)
306 |             mu = (a + 9*b) / (genotypes.shape[1] - b)  # add back 9*n_missing, divide by n_non-missing
307 |             genotypes[m] = mu[ix]
308 |
309 |
310 | class PgenReader(object):
311 | """
312 | Class for reading genotype data from PLINK 2 pgen files
313 |
314 | To generate the pgen/psam/pvar files from a VCF, run
315 | plink2 --vcf ${vcf_file} --output-chr chrM --out ${plink_prefix_path}
316 | To use dosages, run:
317 | plink2 --vcf ${vcf_file} 'dosage=DS' --output-chr chrM --out ${plink_prefix_path}
318 |
319 | Requires pgenlib: https://github.com/chrchang/plink-ng/tree/master/2.0/Python
320 | """
321 | def __init__(self, plink_prefix_path, select_samples=None):
322 | """
323 | plink_prefix_path: prefix to PLINK pgen,psam,pvar files
324 | select_samples: specify a subset of samples
325 | """
326 |
327 | if os.path.exists(f"{plink_prefix_path}.pvar.parquet"):
328 | self.pvar_df = pd.read_parquet(f"{plink_prefix_path}.pvar.parquet")
329 | else:
330 | self.pvar_df = read_pvar(f"{plink_prefix_path}.pvar")
331 | self.psam_df = read_psam(f"{plink_prefix_path}.psam")
332 | self.pgen_file = f"{plink_prefix_path}.pgen"
333 |
334 | self.num_variants = self.pvar_df.shape[0]
335 | self.variant_ids = self.pvar_df['id'].tolist()
336 | self.variant_idx_dict = {i:k for k,i in enumerate(self.variant_ids)}
337 |
338 | self.sample_id_list = self.psam_df.index.tolist()
339 | self.set_samples(select_samples)
340 |
341 | variant_df = self.pvar_df.set_index('id')[['chrom', 'pos']]
342 | variant_df['index'] = np.arange(variant_df.shape[0])
343 | self.variant_df = variant_df
344 | self.variant_dfs = {c:g[['pos', 'index']] for c,g in variant_df.groupby('chrom', sort=False)}
345 |
346 | def set_samples(self, sample_ids=None, sort=True):
347 | """
348 | Set samples to load.
349 |
350 | Parameters
351 | ----------
352 | sample_ids : array_like
353 | List of samples to select.
354 | sort : bool
355 | Preserve sample order from pgen file.
356 |             If True, sort indexes so that samples are read in the order they appear in the pgen file (required by pgenlib).
357 | if sample_ids is None:
358 | self.sample_ids = self.sample_id_list
359 | self.sample_idxs = None
360 | else:
361 | sample_idxs = [self.sample_id_list.index(i) for i in sample_ids]
362 | if sort:
363 | sidx = np.argsort(sample_idxs)
364 | sample_idxs = [sample_idxs[i] for i in sidx]
365 | sample_ids = [sample_ids[i] for i in sidx]
366 | self.sample_ids = sample_ids
367 | self.sample_idxs = sample_idxs
368 |
369 | def get_range(self, region, start=None, end=None):
370 | """
371 | Get variant indexes corresponding to region specified as 'chr:start-end', or as chr, start, end.
372 |
373 | Parameters
374 | ----------
375 | region : str
376 | Genomic region, defined as 'chr:start-end' (1-based, inclusive), or chromosome.
377 | start : int
378 |             Start position of the genomic interval (if the chromosome is provided as the first argument).
379 | end : int
380 |             End position of the genomic interval (if the chromosome is provided as the first argument).
381 |
382 | Returns
383 | -------
384 | indexes : ndarray
385 |             [first, last] variant indexes (inclusive); empty list if no variants fall in the region.
386 | """
387 | if start is None and end is None:
388 | if ':' in region:
389 | chrom, pos = region.split(':')
390 | start, end = [int(i) for i in pos.split('-')]
391 | else: # full chromosome selected
392 | chrom = region
393 | return self.variant_dfs[chrom]['index'].values[[0, -1]]
394 | else: # input is chr, start, end
395 | chrom = region
396 |
397 | lb = bisect.bisect_left(self.variant_dfs[chrom]['pos'].values, start)
398 | ub = bisect.bisect_right(self.variant_dfs[chrom]['pos'].values, end)
399 | if lb != ub:
400 | r = self.variant_dfs[chrom]['index'].values[[lb, ub - 1]]
401 | else:
402 | r = []
403 | return r
404 |
405 | def read(self, variant_id, impute_mean=True, dtype=np.float32):
406 | """Read genotypes for an individual variant as 0,1,2,-9; impute missing values (-9) to mean (default)."""
407 | variant_idx = self.variant_idx_dict[variant_id]
408 | genotypes = read(self.pgen_file, variant_idx, sample_subset=self.sample_idxs,
409 | dtype=np.int8).astype(dtype)
410 | if impute_mean:
411 | _impute_mean(genotypes)
412 | return pd.Series(genotypes, index=self.sample_ids, name=variant_id)
413 |
414 | def read_list(self, variant_ids, impute_mean=True, dtype=np.float32):
415 |         """Read genotypes for a list of variants as 0,1,2,-9; impute missing values (-9) to mean (default)."""
416 | variant_idxs = [self.variant_idx_dict[i] for i in variant_ids]
417 | genotypes = read_list(self.pgen_file, variant_idxs, sample_subset=self.sample_idxs,
418 | dtype=np.int8).astype(dtype)
419 | if impute_mean:
420 | _impute_mean(genotypes)
421 | return pd.DataFrame(genotypes, index=variant_ids, columns=self.sample_ids)
422 |
423 | def read_range(self, start_idx, end_idx, impute_mean=True, dtype=np.float32):
424 |         """Read genotypes for a range of variants as 0,1,2,-9; impute missing values (-9) to mean (default)."""
425 | genotypes = read_range(self.pgen_file, start_idx, end_idx, sample_subset=self.sample_idxs,
426 | dtype=np.int8).astype(dtype)
427 | if impute_mean:
428 | _impute_mean(genotypes)
429 | return pd.DataFrame(genotypes, index=self.variant_ids[start_idx:end_idx+1], columns=self.sample_ids)
430 |
431 | def read_region(self, region, start_pos=None, end_pos=None, impute_mean=True, dtype=np.float32):
432 | """Read genotypes for variants in a genomic region as 0,1,2,-9; impute missing values (-9) to mean (default)."""
433 | r = self.get_range(region, start_pos, end_pos)
434 | if len(r) > 0:
435 | return self.read_range(*r, impute_mean=impute_mean, dtype=dtype)
436 |
437 | def read_dosages(self, variant_id, dtype=np.float32):
438 | variant_idx = self.variant_idx_dict[variant_id]
439 | dosages = read_dosages(self.pgen_file, variant_idx, sample_subset=self.sample_idxs, dtype=dtype)
440 | return pd.Series(dosages, index=self.sample_ids, name=variant_id)
441 |
442 | def read_dosages_list(self, variant_ids, dtype=np.float32):
443 | variant_idxs = [self.variant_idx_dict[i] for i in variant_ids]
444 | dosages = read_dosages_list(self.pgen_file, variant_idxs, sample_subset=self.sample_idxs, dtype=dtype)
445 | return pd.DataFrame(dosages, index=variant_ids, columns=self.sample_ids)
446 |
447 | def read_dosages_range(self, start_idx, end_idx, dtype=np.float32):
448 | dosages = read_dosages_range(self.pgen_file, start_idx, end_idx, sample_subset=self.sample_idxs, dtype=dtype)
449 | return pd.DataFrame(dosages, index=self.variant_ids[start_idx:end_idx+1], columns=self.sample_ids)
450 |
451 | def read_dosages_region(self, region, start_pos=None, end_pos=None, dtype=np.float32):
452 | r = self.get_range(region, start_pos, end_pos)
453 | if len(r) > 0:
454 | return self.read_dosages_range(*r, dtype=dtype)
455 |
456 | def read_alleles(self, variant_id):
457 | variant_idx = self.variant_idx_dict[variant_id]
458 | alleles = read_alleles(self.pgen_file, variant_idx, sample_subset=self.sample_idxs)
459 | s1 = pd.Series(alleles[::2], index=self.sample_ids, name=variant_id)
460 | s2 = pd.Series(alleles[1::2], index=self.sample_ids, name=variant_id)
461 | return s1, s2
462 |
463 | def read_alleles_list(self, variant_ids):
464 | variant_idxs = [self.variant_idx_dict[i] for i in variant_ids]
465 | alleles = read_alleles_list(self.pgen_file, variant_idxs, sample_subset=self.sample_idxs)
466 | df1 = pd.DataFrame(alleles[:,::2], index=variant_ids, columns=self.sample_ids)
467 | df2 = pd.DataFrame(alleles[:,1::2], index=variant_ids, columns=self.sample_ids)
468 | return df1, df2
469 |
470 | def read_alleles_range(self, start_idx, end_idx):
471 | alleles = read_alleles_range(self.pgen_file, start_idx, end_idx, sample_subset=self.sample_idxs)
472 | df1 = pd.DataFrame(alleles[:,::2], index=self.variant_ids[start_idx:end_idx+1], columns=self.sample_ids)
473 | df2 = pd.DataFrame(alleles[:,1::2], index=self.variant_ids[start_idx:end_idx+1], columns=self.sample_ids)
474 | return df1, df2
475 |
476 | def read_alleles_region(self, region, start_pos=None, end_pos=None):
477 | r = self.get_range(region, start_pos, end_pos)
478 | if len(r) > 0:
479 | return self.read_alleles_range(*r)
480 | else:
481 | return None, None
482 |
483 | def load_genotypes(self):
484 | """Load all genotypes as np.int8, without imputing missing values."""
485 | genotypes = read_range(self.pgen_file, 0, self.num_variants-1, sample_subset=self.sample_idxs)
486 | return pd.DataFrame(genotypes, index=self.variant_ids, columns=self.sample_ids)
487 |
488 | def load_dosages(self):
489 | """Load all dosages."""
490 | return self.read_dosages_range(0, self.num_variants-1)
491 |
492 | def load_alleles(self):
493 | """Load all alleles."""
494 | return self.read_alleles_range(0, self.num_variants-1)
495 |
496 | def get_pairwise_ld(self, id1, id2, r2=True, dtype=np.float32):
497 | """Compute pairwise LD (R2) between (lists of) variants"""
498 | if isinstance(id1, str) and isinstance(id2, str):
499 | g1 = self.read(id1, dtype=dtype)
500 | g2 = self.read(id2, dtype=dtype)
501 | g1 -= g1.mean()
502 | g2 -= g2.mean()
503 | if r2:
504 | r = (g1 * g2).sum()**2 / ( (g1**2).sum() * (g2**2).sum() )
505 | else:
506 | r = (g1 * g2).sum() / np.sqrt( (g1**2).sum() * (g2**2).sum() )
507 | elif isinstance(id1, str):
508 | g1 = self.read(id1, dtype=dtype)
509 | g2 = self.read_list(id2, dtype=dtype)
510 | g1 -= g1.mean()
511 | g2 -= g2.values.mean(1, keepdims=True)
512 | if r2:
513 | r = (g1 * g2).sum(1)**2 / ( (g1**2).sum() * (g2**2).sum(1) )
514 | else:
515 | r = (g1 * g2).sum(1) / np.sqrt( (g1**2).sum() * (g2**2).sum(1) )
516 | elif isinstance(id2, str):
517 | g1 = self.read_list(id1, dtype=dtype)
518 | g2 = self.read(id2, dtype=dtype)
519 | g1 -= g1.values.mean(1, keepdims=True)
520 | g2 -= g2.mean()
521 | if r2:
522 | r = (g1 * g2).sum(1)**2 / ( (g1**2).sum(1) * (g2**2).sum() )
523 | else:
524 | r = (g1 * g2).sum(1) / np.sqrt( (g1**2).sum(1) * (g2**2).sum() )
525 | else:
526 | assert len(id1) == len(id2)
527 | g1 = self.read_list(id1, dtype=dtype).values
528 | g2 = self.read_list(id2, dtype=dtype).values
529 | g1 -= g1.mean(1, keepdims=True)
530 | g2 -= g2.mean(1, keepdims=True)
531 | if r2:
532 | r = (g1 * g2).sum(1) ** 2 / ( (g1**2).sum(1) * (g2**2).sum(1) )
533 | else:
534 | r = (g1 * g2).sum(1) / np.sqrt( (g1**2).sum(1) * (g2**2).sum(1) )
535 | return r
536 |
537 | def get_ld_matrix(self, variant_ids, dtype=np.float32):
538 | g = self.read_list(variant_ids, dtype=dtype).values
539 | return pd.DataFrame(np.corrcoef(g), index=variant_ids, columns=variant_ids)
540 |
541 |
542 | def load_dosages_df(plink_prefix_path, select_samples=None):
543 | """
544 | Load dosages for all variants and all/selected samples as a dataframe.
545 |
546 | Parameters
547 | ----------
548 | plink_prefix_path : str
549 | Prefix to .pgen/.psam/.pvar files
550 | select_samples : array_like
551 | List of sample IDs to select. Default: all samples.
552 |
553 | Returns
554 | -------
555 | dosages_df : pd.DataFrame (variants x samples)
556 | Genotype dosages for the selected samples.
557 | """
558 |     p = PgenReader(plink_prefix_path, select_samples=select_samples)
559 |     return p.load_dosages()
560 |
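561 | # Usage sketch (hypothetical file prefix and region; expects the .pgen/.psam/.pvar
562 | # files generated with the plink2 commands shown in the PgenReader docstring):
563 | #
564 | #   pr = PgenReader('GEUVADIS.chr18')                     # reads GEUVADIS.chr18.{pgen,psam,pvar}
565 | #   gt_df = pr.read_region('chr18:1-1000000')             # variants x samples, missing imputed to mean
566 | #   ld_df = pr.get_ld_matrix(gt_df.index[:10].tolist())   # correlation matrix for the first 10 variants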
--------------------------------------------------------------------------------
/tensorqtl/post.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import torch
4 | import scipy.stats as stats
5 | import subprocess
6 | import sys
7 | import os
8 | import glob
9 | from datetime import datetime
10 |
11 | sys.path.insert(1, os.path.dirname(__file__))
12 | from core import *
13 | import mixqtl
14 | import qtl.genotype as gt
15 |
16 |
17 | def calculate_qvalues(res_df, fdr=0.05, qvalue_lambda=None, logger=None):
18 | """Annotate permutation results with q-values, p-value threshold"""
19 | if logger is None:
20 | logger = SimpleLogger()
21 |
22 | logger.write('Computing q-values')
23 | logger.write(f' * Number of phenotypes tested: {res_df.shape[0]}')
24 |
25 | if not res_df['pval_beta'].isnull().all():
26 | pval_col = 'pval_beta'
27 | r = stats.pearsonr(res_df['pval_perm'], res_df['pval_beta'])[0]
28 | logger.write(f' * Correlation between Beta-approximated and empirical p-values: {r:.4f}')
29 | else:
30 | pval_col = 'pval_perm'
31 | logger.write(f' * WARNING: no beta-approximated p-values found, using permutation p-values instead.')
32 |
33 | # calculate q-values
34 | if qvalue_lambda is not None:
35 | logger.write(f' * Calculating q-values with lambda = {qvalue_lambda:.3f}')
36 | qval, pi0 = rfunc.qvalue(res_df[pval_col], lambda_qvalue=qvalue_lambda)
37 |
38 | res_df['qval'] = qval
39 | logger.write(f' * Proportion of significant phenotypes (1-pi0): {1-pi0:.2f}')
40 | logger.write(f" * QTL phenotypes @ FDR {fdr:.2f}: {(res_df['qval'] <= fdr).sum()}")
41 |
42 | # determine global min(p) significance threshold and calculate nominal p-value threshold for each gene
43 | if pval_col == 'pval_beta':
44 | lb = res_df.loc[res_df['qval'] <= fdr, 'pval_beta'].sort_values()
45 | ub = res_df.loc[res_df['qval'] > fdr, 'pval_beta'].sort_values()
46 |
47 | if len(lb) > 0: # significant phenotypes
48 | lb = lb.iloc[-1]
49 | if len(ub) > 0:
50 | ub = ub.iloc[0]
51 | pthreshold = (lb+ub)/2
52 | else:
53 | pthreshold = lb
54 | logger.write(f' * min p-value threshold @ FDR {fdr}: {pthreshold:.6g}')
55 | res_df['pval_nominal_threshold'] = stats.beta.ppf(pthreshold, res_df['beta_shape1'], res_df['beta_shape2'])
56 |
57 |
58 | def calculate_afc(assoc_df, counts_df, genotype_df, variant_df=None, covariates_df=None,
59 | select_covariates=True, group='gene_id',
60 | imputation='offset', count_threshold=0, verbose=True):
61 | """
62 | Calculate allelic fold-change (aFC) for variant-gene pairs
63 |
64 | Inputs
65 | assoc_df: dataframe containing variant-gene associations, must have 'gene_id'
66 | and 'variant_id' columns. If multiple variants/gene are detected, effects
67 | are estimated jointly.
68 | genotype_df: genotype dosages
69 | counts_df: read counts scaled with DESeq size factors. Zeros are imputed using
70 | log(counts + 1) (imputation='offset'; default) or with half-minimum
71 | (imputation='half_min').
72 | covariates_df: covariates (genotype PCs, PEER factors, etc.)
73 |
74 | aFC [1] is computed using the total read count (trc) model from mixQTL [2].
75 |
76 | [1] Mohammadi et al., 2017 (genome.cshlp.org/content/27/11/1872)
77 | [2] Liang et al., 2021 (10.1038/s41467-021-21592-8)
78 | """
79 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
80 |
81 | if variant_df is not None:
82 | gi = gt.GenotypeIndexer(genotype_df, variant_df)
83 | else:
84 | assert isinstance(genotype_df, gt.GenotypeIndexer)
85 | gi = genotype_df
86 | genotype_ix = np.array([gi.genotype_df.columns.tolist().index(i) for i in counts_df.columns])
87 | genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
88 |
89 | if covariates_df is not None:
90 | covariates_t = torch.tensor(covariates_df.values, dtype=torch.float32).to(device)
91 | else:
92 | covariates_t = None
93 |
94 | afc_df = []
95 | n = len(assoc_df[group].unique())
96 | for k, (phenotype_id, gdf) in enumerate(assoc_df.groupby(group, sort=False), 1):
97 |         if verbose and (k % 10 == 0 or k == n):
98 | print(f"\rCalculating aFC for {group.replace('_id','')} {k}/{n}", end='' if k != n else None, flush=True)
99 |
100 | counts_t = torch.tensor(counts_df.loc[phenotype_id].values,
101 | dtype=torch.float32).to(device)
102 | genotypes_t = torch.tensor(gi.get_genotypes(gdf['variant_id'].tolist()), dtype=torch.float32).to(device)
103 | genotypes_t = genotypes_t[:,genotype_ix_t]
104 | impute_mean(genotypes_t)
105 | try:
106 | b, b_se = mixqtl.trc(genotypes_t, counts_t, covariates_t=covariates_t,
107 | select_covariates=select_covariates, count_threshold=count_threshold,
108 | imputation=imputation, mode='multi', return_af=False)
109 | gdf['afc'] = b.cpu().numpy() * np.log2(np.e)
110 | gdf['afc_se'] = b_se.cpu().numpy() * np.log2(np.e)
111 | afc_df.append(gdf)
112 |         except Exception:
113 | print(f'WARNING: aFC calculation failed for {phenotype_id}')
114 | afc_df = pd.concat(afc_df)
115 |
116 | return afc_df
117 |
118 |
119 | def calculate_replication(res_df, genotypes, phenotype_df, covariates_df=None, paired_covariate_df=None,
120 | interaction_s=None, compute_pi1=False, lambda_qvalue=None, logp=False):
121 | """res_df: DataFrame with 'variant_id' column and phenotype IDs as index"""
122 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
123 |
124 | if paired_covariate_df is not None:
125 | assert paired_covariate_df.index.equals(covariates_df.index)
126 | assert paired_covariate_df.columns.isin(phenotype_df.index).all()
127 |
128 | if isinstance(genotypes, pd.DataFrame):
129 | genotypes_t = torch.tensor(genotypes.loc[res_df['variant_id']].values, dtype=torch.float).to(device)
130 | genotype_ix = np.array([genotypes.columns.tolist().index(i) for i in phenotype_df.columns])
131 | else: # pgen.PgenReader
132 | gt_df = genotypes.read_list(res_df['variant_id'], impute_mean=False)
133 | genotypes_t = torch.tensor(gt_df.values, dtype=torch.float).to(device)
134 | genotype_ix = np.array([gt_df.columns.tolist().index(i) for i in phenotype_df.columns])
135 |
136 | genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
137 | genotypes_t = genotypes_t[:,genotype_ix_t]
138 | impute_mean(genotypes_t)
139 | af_t, ma_samples_t, ma_count_t = get_allele_stats(genotypes_t)
140 |
141 | phenotypes_t = torch.tensor(phenotype_df.loc[res_df.index].values, dtype=torch.float32).to(device)
142 |
143 | if covariates_df is not None:
144 | residualizer = Residualizer(torch.tensor(covariates_df.values, dtype=torch.float32).to(device))
145 | # dof -= covariates_df.shape[1]
146 | else:
147 | residualizer = None
148 |
149 | if interaction_s is None:
150 | if paired_covariate_df is None:
151 | if residualizer is not None:
152 | genotype_res_t = residualizer.transform(genotypes_t) # variants x samples
153 | phenotype_res_t = residualizer.transform(phenotypes_t) # phenotypes x samples
154 | dof = residualizer.dof
155 | dof_t = dof
156 | else:
157 | genotype_res_t = genotypes_t
158 | phenotype_res_t = phenotypes_t
159 | dof = phenotypes_t.shape[1] - 2
160 | dof_t = dof
161 | else:
162 | genotype_res_t = torch.zeros_like(genotypes_t).to(device)
163 | phenotype_res_t = torch.zeros_like(phenotypes_t).to(device)
164 | dof = []
165 | for k,phenotype_id in enumerate(res_df.index):
166 | if phenotype_id in paired_covariate_df:
167 | iresidualizer = Residualizer(torch.tensor(np.c_[covariates_df, paired_covariate_df[phenotype_id]],
168 | dtype=torch.float32).to(device))
169 | else:
170 | iresidualizer = residualizer
171 | genotype_res_t[[k]] = iresidualizer.transform(genotypes_t[[k]])
172 | phenotype_res_t[[k]] = iresidualizer.transform(phenotypes_t[[k]])
173 | dof.append(iresidualizer.dof)
174 | dof = np.array(dof)
175 | dof_t = torch.Tensor(dof).to(device)
176 |
177 |         gvar_t = genotype_res_t.var(1)
178 |         pvar_t = phenotype_res_t.var(1)
179 |         std_ratio_t = torch.sqrt(pvar_t / gvar_t)  # ratio of residual standard deviations
180 |
181 | # center and normalize
182 | genotype_res_t = center_normalize(genotype_res_t, dim=1)
183 | phenotype_res_t = center_normalize(phenotype_res_t, dim=1)
184 |
185 | r_nominal_t = (genotype_res_t * phenotype_res_t).sum(1)
186 | r2_nominal_t = r_nominal_t.double().pow(2)
187 |
188 | tstat_t = torch.sqrt((dof_t * r2_nominal_t) / (1 - r2_nominal_t))
189 | slope_t = r_nominal_t * std_ratio_t
190 | slope_se_t = (slope_t.abs().double() / tstat_t).float()
191 | pval = 2*stats.t.cdf(-np.abs(tstat_t.cpu()), dof)
192 |
193 | rep_df = pd.DataFrame(np.c_[res_df.index, res_df['variant_id'], ma_samples_t.cpu(), ma_count_t.cpu(), af_t.cpu(), pval, slope_t.cpu(), slope_se_t.cpu()],
194 | columns=['phenotype_id', 'variant_id', 'ma_samples', 'ma_count', 'af', 'pval_nominal', 'slope', 'slope_se']).infer_objects()
195 |
196 | else:
197 | if paired_covariate_df is not None:
198 | raise NotImplementedError("Paired covariates are not yet supported for interactions")
199 |
200 | interaction_t = torch.tensor(interaction_s.values.reshape(1,-1), dtype=torch.float32).to(device)
201 | ng, ns = genotypes_t.shape
202 | nps = phenotypes_t.shape[0]
203 |
204 | # centered inputs
205 | g0_t = genotypes_t - genotypes_t.mean(1, keepdim=True)
206 | gi_t = genotypes_t * interaction_t
207 | gi0_t = gi_t - gi_t.mean(1, keepdim=True)
208 | i0_t = interaction_t - interaction_t.mean()
209 | p0_t = phenotypes_t - phenotypes_t.mean(1, keepdim=True)
210 |
211 | # residualize rows
212 | g0_t = residualizer.transform(g0_t, center=False)
213 | gi0_t = residualizer.transform(gi0_t, center=False)
214 | p0_t = residualizer.transform(p0_t, center=False) # np x ns
215 | i0_t = residualizer.transform(i0_t, center=False)
216 | i0_t = i0_t.repeat(ng, 1)
217 |
218 | # regression (in float; loss of precision may occur in edge cases)
219 | X_t = torch.stack([g0_t, i0_t, gi0_t], 2) # ng x ns x 3
220 | Xinv = torch.matmul(torch.transpose(X_t, 1, 2), X_t).inverse() # ng x 3 x 3
221 | b_t = (torch.matmul(Xinv, torch.transpose(X_t, 1, 2)) * p0_t.unsqueeze(1)).sum(2) # ng x 3
222 | r_t = (X_t * b_t.unsqueeze(1)).sum(2) - p0_t
223 | dof = residualizer.dof - 2
224 | rss_t = (r_t*r_t).sum(1) # ng x np
225 | b_se_t = torch.sqrt( Xinv[:, torch.eye(3, dtype=torch.uint8).bool()] * rss_t.unsqueeze(-1) / dof )
226 | tstat_t = (b_t.double() / b_se_t.double()).float()
227 | pval = 2*stats.t.cdf(-np.abs(tstat_t.cpu()), dof)
228 | b = b_t.cpu()
229 | b_se = b_se_t.cpu()
230 |
231 | rep_df = pd.DataFrame(np.c_[res_df.index, res_df['variant_id'], ma_samples_t.cpu(), ma_count_t.cpu(), af_t.cpu(),
232 | pval[:,0], b[:,0], b_se[:,0], pval[:,1], b[:,1], b_se[:,1], pval[:,2], b[:,2], b_se[:,2]],
233 | columns=['phenotype_id', 'variant_id', 'ma_samples', 'ma_count', 'af',
234 | 'pval_g', 'b_g', 'b_g_se', 'pval_i', 'b_i', 'b_i_se', 'pval_gi', 'b_gi', 'b_gi_se']).infer_objects()
235 | pval = pval[:,2]
236 |
237 | if compute_pi1:
238 | try:
239 | pi1 = 1 - rfunc.pi0est(pval, lambda_qvalue=lambda_qvalue)[0]
240 | except:
241 | pi1 = np.nan
242 | return pi1, rep_df
243 | else:
244 | return rep_df
245 |
246 |
247 | def annotate_genes(gene_df, annotation_gtf, lookup_df=None):
248 | """
249 | Add gene and variant annotations (e.g., gene_name, rs_id, etc.) to gene-level output
250 |
251 | gene_df: output from map_cis()
252 | annotation_gtf: gene annotation in GTF format
253 | lookup_df: DataFrame with variant annotations, indexed by 'variant_id'
254 | """
255 | gene_dict = {}
256 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Adding gene and variant annotations', flush=True)
257 | print(' * parsing GTF', flush=True)
258 | with open(annotation_gtf) as gtf:
259 | for row in gtf:
260 | row = row.strip().split('\t')
261 | if row[0][0] == '#' or row[2] != 'gene': continue
262 | # get gene_id and gene_name from attributes
263 | attr = dict([i.split() for i in row[8].replace('"','').split(';') if i!=''])
264 | # gene_name, gene_chr, gene_start, gene_end, strand
265 | gene_dict[attr['gene_id']] = [attr['gene_name'], row[0], row[3], row[4], row[6]]
266 |
267 | print(' * annotating genes', flush=True)
268 | if 'group_id' in gene_df:
269 | gene_info = pd.DataFrame(data=[gene_dict[i] for i in gene_df['group_id']],
270 | columns=['gene_name', 'gene_chr', 'gene_start', 'gene_end', 'strand'],
271 | index=gene_df.index)
272 | else:
273 | gene_info = pd.DataFrame(data=[gene_dict[i] for i in gene_df.index],
274 | columns=['gene_name', 'gene_chr', 'gene_start', 'gene_end', 'strand'],
275 | index=gene_df.index)
276 | gene_df = pd.concat([gene_info, gene_df], axis=1)
277 | assert np.all(gene_df.index == gene_info.index)
278 |
279 | col_order = ['gene_name', 'gene_chr', 'gene_start', 'gene_end', 'strand',
280 | 'num_var', 'beta_shape1', 'beta_shape2', 'true_df', 'pval_true_df', 'variant_id']
281 | if 'tss_distance' in gene_df:
282 | col_order += ['tss_distance']
283 | else:
284 | col_order += ['start_distance', 'end_distance']
285 | if lookup_df is not None:
286 | print(' * adding variant annotations from lookup table', flush=True)
287 | gene_df = gene_df.join(lookup_df, on='variant_id') # add variant information
288 | col_order += list(lookup_df.columns)
289 | col_order += ['ma_samples', 'ma_count', 'af', 'pval_nominal',
290 | 'slope', 'slope_se', 'pval_perm', 'pval_beta']
291 | if 'group_id' in gene_df:
292 | col_order += ['group_id', 'group_size']
293 | col_order += ['qval', 'pval_nominal_threshold']
294 | gene_df = gene_df[col_order]
295 | print('done.', flush=True)
296 | return gene_df
297 |
298 |
299 | def get_significant_pairs(res_df, nominal_files, group_s=None, fdr=0.05):
300 | """Significant variant-phenotype pairs based on nominal p-value threshold for each phenotype"""
301 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] tensorQTL: parsing all significant variant-phenotype pairs', flush=True)
302 | assert 'qval' in res_df
303 |
304 | # significant phenotypes (apply FDR threshold)
305 | if group_s is not None:
306 | df = res_df.loc[res_df['qval'] <= fdr, ['pval_nominal_threshold', 'pval_nominal', 'pval_beta', 'group_id']].copy()
307 | df.set_index('group_id', inplace=True)
308 | else:
309 | df = res_df.loc[res_df['qval'] <= fdr, ['pval_nominal_threshold', 'pval_nominal', 'pval_beta']].copy()
310 | df.rename(columns={'pval_nominal': 'min_pval_nominal'}, inplace=True)
311 | signif_phenotype_ids = set(df.index)
312 | threshold_dict = df['pval_nominal_threshold'].to_dict()
313 |
314 | if isinstance(nominal_files, str):
315 | # chr -> file
316 | nominal_files = {os.path.basename(i).split('.')[-2]:i for i in glob.glob(nominal_files+'*.parquet')}
317 | else:
318 | assert isinstance(nominal_files, dict)
319 |
320 | chroms = sorted(nominal_files.keys(), key=lambda x: int(x.replace('chr', '').replace('X', '23')))
321 | signif_df = []
322 | for k,c in enumerate(chroms, 1):
323 | print(f' * processing chr. {k}/{len(chroms)}', end='\r', flush=True)
324 | nominal_df = pd.read_parquet(nominal_files[c])
325 | # drop pairs that never pass threshold
326 | nominal_df = nominal_df[nominal_df['pval_nominal'] <= df['pval_nominal_threshold'].max()]
327 | if group_s is not None:
328 | nominal_df.insert(1, 'group_id', nominal_df['phenotype_id'].map(group_s))
329 | nominal_df = nominal_df[nominal_df['group_id'].isin(signif_phenotype_ids)]
330 | m = nominal_df['pval_nominal'] < nominal_df['group_id'].apply(lambda x: threshold_dict[x])
331 | else:
332 | nominal_df = nominal_df[nominal_df['phenotype_id'].isin(signif_phenotype_ids)]
333 | m = nominal_df['pval_nominal'] < nominal_df['phenotype_id'].apply(lambda x: threshold_dict[x])
334 | signif_df.append(nominal_df[m])
335 | print()
336 | signif_df = pd.concat(signif_df, axis=0)
337 | if group_s is not None:
338 | signif_df = signif_df.merge(df, left_on='group_id', right_index=True)
339 | else:
340 | signif_df = signif_df.merge(df, left_on='phenotype_id', right_index=True)
341 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] done', flush=True)
342 | return signif_df.reset_index(drop=True)
343 |
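344 | # Usage sketch (hypothetical inputs: `cis_df` is the phenotype-level output of
345 | # cis.map_cis(), `nominal_prefix` points at the per-chromosome parquet files written
346 | # by cis.map_nominal(), and `counts_df` holds DESeq size factor-normalized counts):
347 | #
348 | #   calculate_qvalues(cis_df, fdr=0.05)   # adds 'qval' and 'pval_nominal_threshold' columns
349 | #   signif_df = get_significant_pairs(cis_df, nominal_prefix, fdr=0.05)
350 | #   afc_df = calculate_afc(signif_df, counts_df, genotype_df, variant_df=variant_df,
351 | #                          covariates_df=covariates_df)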
--------------------------------------------------------------------------------
/tensorqtl/rfunc.py:
--------------------------------------------------------------------------------
1 | # Author: Francois Aguet
2 | import os, sys
3 | import numpy as np
4 | import rpy2
5 | from rpy2.robjects.packages import importr
6 | from collections.abc import Iterable
7 | from contextlib import contextmanager
8 | # silence R warnings
9 | from rpy2.rinterface_lib.callbacks import logger as rpy2_logger
10 | import logging
11 | rpy2_logger.setLevel(logging.ERROR)
12 |
13 | @contextmanager
14 | def suppress_stdout():
15 | with open(os.devnull, "w") as devnull:
16 | old_stdout = sys.stdout
17 | sys.stdout = devnull
18 | try:
19 | yield
20 | finally:
21 | sys.stdout = old_stdout
22 |
23 |
24 | def p_adjust(p, method='BH'):
25 | """Wrapper for p.adjust"""
26 | rp = rpy2.robjects.vectors.FloatVector(p)
27 | p_adjust = rpy2.robjects.r['p.adjust']
28 | return np.array(p_adjust(rp, method=method))
29 |
30 |
31 | def t_cdf(t, df, lower_tail=False, log=True):
32 | """Wrapper for pt"""
33 | scalar = True
34 | if isinstance(t, Iterable):
35 | rt = rpy2.robjects.vectors.FloatVector(t)
36 | scalar = False
37 | else:
38 | rt = t
39 | if isinstance(df, Iterable):
40 | rdf = rpy2.robjects.vectors.FloatVector(df)
41 | scalar = False
42 | else:
43 | rdf = df
44 | r_pt = rpy2.robjects.r['pt']
45 | res = np.array(r_pt(rt, rdf, lower_tail=lower_tail, log=log))
46 | if scalar:
47 | res = res[0]
48 | return res
49 |
50 |
51 | def qvalue(p, lambda_qvalue=None):
52 | """Wrapper for qvalue::qvalue"""
53 | qvalue = importr("qvalue")
54 | rp = rpy2.robjects.vectors.FloatVector(p)
55 | if lambda_qvalue is None:
56 | q = qvalue.qvalue(rp)
57 | else:
58 | if not isinstance(lambda_qvalue, Iterable):
59 | lambda_qvalue = [lambda_qvalue]
60 | rlambda = rpy2.robjects.vectors.FloatVector(lambda_qvalue)
61 | q = qvalue.qvalue(rp, **{'lambda':rlambda})
62 | qval = np.array(q.rx2('qvalues'))
63 | pi0 = np.array(q.rx2('pi0'))[0]
64 | return qval, pi0
65 |
66 |
67 | def pi0est(p, lambda_qvalue=None):
68 | """Wrapper for qvalue::pi0est"""
69 | qvalue = importr("qvalue")
70 | rp = rpy2.robjects.vectors.FloatVector(p)
71 | # with suppress_stdout():
72 | if lambda_qvalue is None:
73 | pi0res = qvalue.pi0est(rp)
74 | else:
75 | if not isinstance(lambda_qvalue, Iterable):
76 | lambda_qvalue = [lambda_qvalue]
77 | rlambda = rpy2.robjects.vectors.FloatVector(lambda_qvalue)
78 | pi0res = qvalue.pi0est(rp, rlambda)
79 | pi0 = np.array(pi0res.rx2('pi0'))[0]
80 | pi0_lambda = np.array(pi0res.rx2('pi0.lambda'))
81 | lambda_vec = np.array(pi0res.rx2('lambda'))
82 | pi0_smooth = np.array(pi0res.rx2('pi0.smooth'))
83 | return pi0, pi0_lambda, lambda_vec, pi0_smooth
84 |
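85 | # Usage sketch (assumes R and the Bioconductor 'qvalue' package are installed):
86 | #
87 | #   import numpy as np
88 | #   pvals = np.random.uniform(size=1000)   # placeholder p-values
89 | #   padj = p_adjust(pvals, method='BH')    # Benjamini-Hochberg via R's p.adjust
90 | #   qval, pi0 = qvalue(pvals)              # Storey q-values and pi0 estimate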
--------------------------------------------------------------------------------
/tensorqtl/tensorqtl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from __future__ import print_function
3 | import pandas as pd
4 | import numpy as np
5 | from datetime import datetime
6 | import sys
7 | import os
8 | import re
9 | import pickle
10 | import argparse
11 | from collections import defaultdict
12 | import importlib.metadata
13 |
14 | sys.path.insert(1, os.path.dirname(__file__))
15 | from core import *
16 | from post import *
17 | import genotypeio, cis, trans, susie
18 |
19 |
20 | def main():
21 | parser = argparse.ArgumentParser(description='tensorQTL: GPU-based QTL mapper')
22 | parser.add_argument('genotype_path', help='Genotypes in PLINK format')
23 | parser.add_argument('phenotypes', help="Phenotypes in BED format (.bed, .bed.gz, .bed.parquet), or optionally for 'trans' mode, parquet or tab-delimited.")
24 | parser.add_argument('prefix', help='Prefix for output file names')
25 | parser.add_argument('--mode', type=str, default='cis', choices=['cis', 'cis_nominal', 'cis_independent', 'cis_susie', 'trans', 'trans_susie'],
26 | help='Mapping mode. Default: cis')
27 | parser.add_argument('--covariates', default=None, help='Covariates file, tab-delimited (covariates x samples)')
28 | parser.add_argument('--paired_covariate', default=None, help='Single phenotype-specific covariate, tab-delimited (phenotypes x samples)')
29 | parser.add_argument('--permutations', type=int, default=10000, help='Number of permutations. Default: 10000')
30 | parser.add_argument('--interaction', default=None, type=str, help='Tab-delimited file mapping sample ID to interaction value(s) (if multiple interaction terms are used, the file must include a header with variable names)')
31 | parser.add_argument('--cis_output', default=None, type=str, help="Output from 'cis' mode with q-values. Required for independent cis-QTL mapping.")
32 | parser.add_argument('--phenotype_groups', default=None, type=str, help='Phenotype groups. Header-less TSV with two columns: phenotype_id, group_id')
33 | parser.add_argument('--window', default=1000000, type=np.int32, help='Cis-window size, in bases. Default: 1000000.')
34 | parser.add_argument('--pval_threshold', default=1e-5, type=np.float64, help='Output only significant phenotype-variant pairs with a p-value below threshold. Default: 1e-5 for trans-QTL')
35 | parser.add_argument('--logp', action='store_true', help='Compute nominal p-values as -log10(P) for added precision (requires R)')
36 |     parser.add_argument('--maf_threshold', default=None, type=np.float64, help='Include only genotypes with minor allele frequency >= maf_threshold. Default: 0 (cis), 0.05 (trans)')
37 | parser.add_argument('--maf_threshold_interaction', default=0.05, type=np.float64, help='MAF threshold for interactions, applied to lower and upper half of samples')
38 |     parser.add_argument('--dosages', action='store_true', help='Load dosages instead of genotypes (only applies to PLINK 2 pgen input).')
39 | parser.add_argument('--return_dense', action='store_true', help='Return dense output for trans-QTL.')
40 | parser.add_argument('--return_r2', action='store_true', help='Return r2 (only for sparse trans-QTL output)')
41 | parser.add_argument('--best_only', action='store_true', help='Only write lead association for each phenotype (interaction mode only)')
42 | parser.add_argument('--output_text', action='store_true', help='Write output in txt.gz format instead of parquet (trans-QTL mode only)')
43 | parser.add_argument('--batch_size', type=int, default=20000, help='GPU batch size (trans-QTLs only). Reduce this if encountering OOM errors.')
44 | parser.add_argument('--chunk_size', default=None, help="For cis-QTL mapping, load genotypes into CPU memory in chunks of chunk_size variants, or by chromosome if chunk_size is 'chr'.")
45 | parser.add_argument('--susie_loci', default=None, help="Table (parquet or tsv) with loci to fine-map (phenotype_id, chr, pos) with mode 'trans_susie'.")
46 | parser.add_argument('--disable_beta_approx', action='store_true', help='Disable Beta-distribution approximation of empirical p-values (not recommended).')
47 | parser.add_argument('--warn_monomorphic', action='store_true', help='Warn if monomorphic variants are found.')
48 | parser.add_argument('--max_effects', type=int, default=10, help='Maximum number of non-zero effects in the SuSiE regression model.')
49 | parser.add_argument('--fdr', default=0.05, type=np.float64, help='FDR for cis-QTLs')
50 | parser.add_argument('--qvalue_lambda', default=None, type=np.float64, help='lambda parameter for pi0est in qvalue.')
51 | parser.add_argument('--seed', default=None, type=int, help='Seed for permutations.')
52 | parser.add_argument('-o', '--output_dir', default='.', help='Output directory')
53 | args = parser.parse_args()
54 |
55 | # check inputs
56 | if args.mode == 'cis_independent' and (args.cis_output is None or not os.path.exists(args.cis_output)):
57 | raise ValueError("Output from 'cis' mode must be provided.")
58 | if args.interaction is not None and args.mode not in ['cis_nominal', 'trans']:
59 | raise ValueError("Interactions are only supported in 'cis_nominal' or 'trans' mode.")
60 |
61 | logger = SimpleLogger(os.path.join(args.output_dir, f'{args.prefix}.tensorQTL.{args.mode}.log'))
62 | logger.write(f'[{datetime.now().strftime("%b %d %H:%M:%S")}] Running TensorQTL v{importlib.metadata.version("tensorqtl")}: {args.mode.split("_")[0]}-QTL mapping')
63 | if torch.cuda.is_available():
64 | logger.write(f' * using GPU ({torch.cuda.get_device_name(torch.cuda.current_device())})')
65 | else:
66 | logger.write(' * WARNING: using CPU!')
67 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
68 | if args.seed is not None:
69 | logger.write(f' * using seed {args.seed}')
70 |
71 | # load inputs
72 | logger.write(f' * reading phenotypes ({args.phenotypes})')
73 | # for cis modes, require BED input with position information
74 | if args.mode.startswith('cis'):
75 | assert args.phenotypes.lower().endswith(('.bed', '.bed.gz', '.bed.parquet')), "For cis modes, phenotypes must be in BED format."
76 | phenotype_df, phenotype_pos_df = read_phenotype_bed(args.phenotypes)
77 | if phenotype_pos_df.columns[1] == 'pos':
78 | logger.write(f" * cis-window detected as position ± {args.window:,}")
79 | else:
80 | logger.write(f" * cis-window detected as [start - {args.window:,}, end + {args.window:,}]")
81 | elif args.mode.startswith('trans'):
82 | if args.phenotypes.lower().endswith(('.bed', '.bed.gz', '.bed.parquet')):
83 | phenotype_df, phenotype_pos_df = read_phenotype_bed(args.phenotypes)
84 | else:
85 | if args.phenotypes.endswith('.parquet'):
86 | phenotype_df = pd.read_parquet(args.phenotypes)
87 | else: # assume tab-delimited
88 | phenotype_df = pd.read_csv(args.phenotypes, sep='\t', index_col=0)
89 | phenotype_pos_df = None
90 |
91 | if args.covariates is not None:
92 | logger.write(f' * reading covariates ({args.covariates})')
93 | covariates_df = pd.read_csv(args.covariates, sep='\t', index_col=0).T
94 | assert phenotype_df.columns.equals(covariates_df.index)
95 | else:
96 | covariates_df = None
97 |
98 | if args.paired_covariate is not None:
99 | assert covariates_df is not None, f"Covariates matrix must be provided when using paired covariate"
100 | paired_covariate_df = pd.read_csv(args.paired_covariate, sep='\t', index_col=0) # phenotypes x samples
101 | assert paired_covariate_df.index.isin(phenotype_df.index).all(), f"Paired covariate phenotypes must be present in phenotype matrix."
102 | assert paired_covariate_df.columns.equals(phenotype_df.columns), f"Paired covariate samples must match samples in phenotype matrix."
103 | else:
104 | paired_covariate_df = None
105 |
106 | if args.interaction is not None:
107 | logger.write(f' * reading interaction term(s) ({args.interaction})')
108 | # allow headerless input for single interactions
109 | with open(args.interaction) as f:
110 | f.readline()
111 | s = f.readline().strip()
112 | if len(s.split('\t')) == 2: # index + value
113 | interaction_df = pd.read_csv(args.interaction, sep='\t', index_col=0, header=None)
114 | else:
115 | interaction_df = pd.read_csv(args.interaction, sep='\t', index_col=0)
116 | # select samples
117 | assert covariates_df.index.isin(interaction_df.index).all()
118 | interaction_df = interaction_df.loc[covariates_df.index].astype(np.float32)
119 | else:
120 | interaction_df = None
121 |
122 | if args.maf_threshold is None:
123 | if args.mode == 'trans':
124 | maf_threshold = 0.05
125 | else:
126 | maf_threshold = 0
127 | else:
128 | maf_threshold = args.maf_threshold
129 |
130 | if args.phenotype_groups is not None:
131 | group_s = pd.read_csv(args.phenotype_groups, sep='\t', index_col=0, header=None).squeeze('columns').rename(None)
132 | # verify sort order
133 | group_dict = group_s.to_dict()
134 | previous_group = ''
135 | parsed_groups = 0
136 | for i in phenotype_df.index:
137 | if group_dict[i] != previous_group:
138 | parsed_groups += 1
139 | previous_group = group_dict[i]
140 |         if parsed_groups != len(group_s.unique()):
141 | raise ValueError('Groups defined in input do not match phenotype file (check sort order).')
142 | else:
143 | group_s = None
144 |
145 | # load genotypes
146 | if args.chunk_size is None: # load all genotypes into memory
147 | logger.write(f' * loading genotype dosages' if args.dosages else f' * loading genotypes')
148 | genotype_df, variant_df = genotypeio.load_genotypes(args.genotype_path, select_samples=phenotype_df.columns, dosages=args.dosages)
149 | if variant_df is None:
150 | assert not args.mode.startswith('cis'), f"Genotype data without variant positions is only supported for mode='trans'."
151 | else:
152 | if not all([os.path.exists(f"{args.genotype_path}.{ext}") for ext in ['pgen', 'psam', 'pvar']]):
153 | raise ValueError("Processing in chunks requires PLINK 2 pgen/psam/pvar files.")
154 | import pgen
155 | pgr = pgen.PgenReader(args.genotype_path, select_samples=phenotype_df.columns)
156 |
157 | if args.mode == 'cis':
158 | if args.chunk_size is None:
159 | res_df = cis.map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df=covariates_df,
160 | group_s=group_s, paired_covariate_df=paired_covariate_df, nperm=args.permutations,
161 | window=args.window, beta_approx=not args.disable_beta_approx, maf_threshold=maf_threshold,
162 | warn_monomorphic=args.warn_monomorphic, logger=logger, seed=args.seed, verbose=True)
163 | else:
164 | res_df = []
165 | for gt_df, var_df, p_df, p_pos_df, _ in genotypeio.generate_paired_chunks(pgr, phenotype_df, phenotype_pos_df, args.chunk_size,
166 | dosages=args.dosages, verbose=True):
167 | res_df.append(cis.map_cis(gt_df, var_df, p_df, p_pos_df, covariates_df=covariates_df,
168 | group_s=group_s, paired_covariate_df=paired_covariate_df, nperm=args.permutations,
169 | window=args.window, beta_approx=not args.disable_beta_approx, maf_threshold=maf_threshold,
170 | warn_monomorphic=args.warn_monomorphic, logger=logger, seed=args.seed, verbose=True))
171 | res_df = pd.concat(res_df)
172 | logger.write(' * writing output')
173 | if has_rpy2:
174 | calculate_qvalues(res_df, fdr=args.fdr, qvalue_lambda=args.qvalue_lambda, logger=logger)
175 | out_file = os.path.join(args.output_dir, f'{args.prefix}.cis_qtl.txt.gz')
176 | res_df.to_csv(out_file, sep='\t', float_format='%.6g')
177 |
178 | elif args.mode == 'cis_nominal':
179 | if args.chunk_size is None:
180 | cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, args.prefix, covariates_df=covariates_df,
181 | paired_covariate_df=paired_covariate_df, interaction_df=interaction_df,
182 | maf_threshold_interaction=args.maf_threshold_interaction,
183 | group_s=None, window=args.window, maf_threshold=maf_threshold, run_eigenmt=True,
184 | output_dir=args.output_dir, write_top=True, write_stats=not args.best_only, logger=logger, verbose=True)
185 | # compute significant pairs
186 | if args.cis_output is not None:
187 | cis_df = pd.read_csv(args.cis_output, sep='\t', index_col=0)
188 | nominal_prefix = os.path.join(args.output_dir, f'{args.prefix}.cis_qtl_pairs')
189 | signif_df = get_significant_pairs(cis_df, nominal_prefix, group_s=group_s, fdr=args.fdr)
190 | signif_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.cis_qtl.signif_pairs.parquet'))
191 |
192 | else:
193 | chunks = []
194 | for gt_df, var_df, p_df, p_pos_df, ci in genotypeio.generate_paired_chunks(pgr, phenotype_df, phenotype_pos_df, args.chunk_size,
195 | dosages=args.dosages, verbose=True):
196 | prefix = f"{args.prefix}.chunk{ci+1}"
197 | chunks.append(prefix)
198 | cis.map_nominal(gt_df, var_df, p_df, p_pos_df, prefix, covariates_df=covariates_df,
199 | paired_covariate_df=paired_covariate_df, interaction_df=interaction_df,
200 | maf_threshold_interaction=args.maf_threshold_interaction,
201 | group_s=None, window=args.window, maf_threshold=maf_threshold, run_eigenmt=True,
202 | output_dir=args.output_dir, write_top=True, write_stats=not args.best_only, logger=logger, verbose=True)
203 | chunk_files = glob.glob(os.path.join(args.output_dir, f"{args.prefix}.chunk*.cis_qtl_pairs.*.parquet"))
204 | if args.chunk_size == 'chr': # remove redundant chunk ID from file names
205 | for f in chunk_files:
206 |                     x = re.findall(rf"{args.prefix}\.(chunk\d+)", os.path.basename(f))
207 | assert len(x) == 1
208 | os.rename(f, f.replace(f"{x[0]}.", ""))
209 | else: # concatenate outputs by chromosome
210 | chunk_df = pd.DataFrame({
211 | 'file': chunk_files,
212 |                         'chunk': [int(re.findall(rf"{args.prefix}\.chunk(\d+)", os.path.basename(i))[0]) for i in chunk_files],
213 |                         'chr': [re.findall(r"\.cis_qtl_pairs\.(.*)\.parquet", os.path.basename(i))[0] for i in chunk_files],
214 | }).sort_values('chunk')
215 | for chrom, chr_df in chunk_df.groupby('chr', sort=False):
216 | print(f"\rConcatenating outputs for {chrom}", end='' if chrom != chunk_df['chr'].iloc[-1] else None)
217 | pd.concat([pd.read_parquet(f) for f in chr_df['file']]).reset_index(drop=True).to_parquet(
218 | os.path.join(args.output_dir, f"{args.prefix}.cis_qtl_pairs.{chrom}.parquet"))
219 | for f in chr_df['file']:
220 | os.remove(f)
221 | # concatenate interaction results
222 | if interaction_df is not None:
223 | chunk_files = [os.path.join(args.output_dir, f"{c}.cis_qtl_top_assoc.txt.gz") for c in chunks]
224 | pd.concat([pd.read_csv(f, sep='\t', index_col=0, dtype=str) for f in chunk_files]).to_csv(
225 | os.path.join(args.output_dir, f"{args.prefix}.cis_qtl_top_assoc.txt.gz"), sep='\t')
226 | for f in chunk_files:
227 | os.remove(f)
228 |
229 | elif args.mode == 'cis_independent':
230 | summary_df = pd.read_csv(args.cis_output, sep='\t', index_col=0)
231 | summary_df.rename(columns={'minor_allele_samples':'ma_samples', 'minor_allele_count':'ma_count'}, inplace=True)
232 | if args.chunk_size is None:
233 | res_df = cis.map_independent(genotype_df, variant_df, summary_df, phenotype_df, phenotype_pos_df, covariates_df=covariates_df,
234 | group_s=group_s, fdr=args.fdr, nperm=args.permutations, window=args.window,
235 | maf_threshold=maf_threshold, logger=logger, seed=args.seed, verbose=True)
236 | else:
237 | res_df = []
238 | for gt_df, var_df, p_df, p_pos_df, _ in genotypeio.generate_paired_chunks(pgr, phenotype_df, phenotype_pos_df, args.chunk_size,
239 | dosages=args.dosages, verbose=True):
240 | res_df.append(cis.map_independent(gt_df, var_df, summary_df, p_df, p_pos_df, covariates_df=covariates_df,
241 | group_s=group_s, fdr=args.fdr, nperm=args.permutations, window=args.window,
242 | maf_threshold=maf_threshold, logger=logger, seed=args.seed, verbose=True))
243 | res_df = pd.concat(res_df).reset_index(drop=True)
244 | logger.write(' * writing output')
245 | out_file = os.path.join(args.output_dir, f'{args.prefix}.cis_independent_qtl.txt.gz')
246 | res_df.to_csv(out_file, sep='\t', index=False, float_format='%.6g')
247 |
248 | elif args.mode == 'cis_susie':
249 | if args.cis_output.endswith('.parquet'):
250 | signif_df = pd.read_parquet(args.cis_output)
251 | else:
252 | signif_df = pd.read_csv(args.cis_output, sep='\t')
253 | if 'qval' in signif_df: # otherwise input is from get_significant_pairs
254 | signif_df = signif_df[signif_df['qval'] <= args.fdr]
255 | phenotype_ids = phenotype_df.index[phenotype_df.index.isin(signif_df['phenotype_id'].unique())]
256 | phenotype_df = phenotype_df.loc[phenotype_ids]
257 | phenotype_pos_df = phenotype_pos_df.loc[phenotype_ids]
258 | if args.chunk_size is None:
259 | summary_df, res = susie.map(genotype_df, variant_df, phenotype_df, phenotype_pos_df,
260 | covariates_df, paired_covariate_df=paired_covariate_df, L=args.max_effects,
261 | maf_threshold=maf_threshold, max_iter=500, window=args.window, summary_only=False)
262 | else:
263 | summary_df = []
264 | res = {}
265 | for gt_df, var_df, p_df, p_pos_df, _ in genotypeio.generate_paired_chunks(pgr, phenotype_df, phenotype_pos_df, args.chunk_size,
266 | dosages=args.dosages, verbose=True):
267 | chunk_summary_df, chunk_res = susie.map(gt_df, var_df, p_df, p_pos_df,
268 | covariates_df, paired_covariate_df=paired_covariate_df, L=args.max_effects,
269 | maf_threshold=maf_threshold, max_iter=500, window=args.window, summary_only=False)
270 | summary_df.append(chunk_summary_df)
271 | res |= chunk_res
272 | summary_df = pd.concat(summary_df).reset_index(drop=True)
273 |
274 | summary_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.SuSiE_summary.parquet'))
275 | with open(os.path.join(args.output_dir, f'{args.prefix}.SuSiE.pickle'), 'wb') as f:
276 | pickle.dump(res, f)
277 |
278 | elif args.mode == 'trans_susie':
279 | assert args.susie_loci is not None
280 | if args.susie_loci.endswith('.parquet'):
281 | locus_df = pd.read_parquet(args.susie_loci)
282 | else:
283 | locus_df = pd.read_csv(args.susie_loci, sep='\t')
284 | locus_df.rename(columns={'position':'pos'}, inplace=True)
285 | if args.chunk_size is None:
286 | assert variant_df is not None
287 | summary_df, res = susie.map_loci(locus_df, genotype_df, variant_df, phenotype_df, covariates_df,
288 | maf_threshold=maf_threshold, max_iter=500, window=args.window)
289 | else:
290 |             raise NotImplementedError('Chunked processing (--chunk_size) is not implemented for trans_susie mode.')
291 |
292 | summary_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.SuSiE_summary.parquet'))
293 | with open(os.path.join(args.output_dir, f'{args.prefix}.SuSiE.pickle'), 'wb') as f:
294 | pickle.dump(res, f)
295 |
296 | elif args.mode == 'trans':
297 | return_sparse = not args.return_dense
298 | if return_sparse:
299 | logger.write(f' * p-value threshold: {args.pval_threshold:.2g}')
300 |
301 | if interaction_df is not None:
302 | if interaction_df.shape[1] > 1:
303 | raise NotImplementedError('trans-QTL mapping currently only supports a single interaction.')
304 | else:
305 | interaction_df = interaction_df.squeeze('columns')
306 |
307 | if args.chunk_size is None:
308 | pairs_df = trans.map_trans(genotype_df, phenotype_df, covariates_df=covariates_df, interaction_s=interaction_df,
309 | return_sparse=return_sparse, pval_threshold=args.pval_threshold,
310 | maf_threshold=maf_threshold, batch_size=args.batch_size,
311 | return_r2=args.return_r2, logger=logger)
312 | if args.return_dense:
313 | pval_df, b_df, b_se_df, af_s = pairs_df
314 | else:
315 | pairs_df = []
316 | n, rem = np.divmod(pgr.num_variants, int(args.chunk_size))
317 | bounds = [0] + n * [int(args.chunk_size)]
318 | if rem != 0:
319 | bounds.append(rem)
320 | bounds = np.cumsum(bounds)
321 | nchunks = len(bounds)-1
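            # e.g., with pgr.num_variants=25 and chunk_size=10: divmod gives n=2, rem=5,
            # so bounds=[0, 10, 10, 5] -> cumsum -> [0, 10, 20, 25] and nchunks=3;
            # read_range then loads the inclusive index ranges [0, 9], [10, 19], [20, 24]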
322 | for i in range(nchunks):
323 | print(f"Processing genotype chunk {i+1}/{nchunks}")
324 | if args.dosages:
325 | gt_df = pgr.read_dosages_range(bounds[i], bounds[i+1]-1, dtype=np.float32)
326 | else:
327 | gt_df = pgr.read_range(bounds[i], bounds[i+1]-1, impute_mean=False, dtype=np.int8)
328 | pairs_df.append(trans.map_trans(gt_df, phenotype_df, covariates_df=covariates_df, interaction_s=interaction_df,
329 | return_sparse=return_sparse, pval_threshold=args.pval_threshold,
330 | maf_threshold=maf_threshold, batch_size=args.batch_size,
331 | return_r2=args.return_r2, logger=logger))
332 | pairs_df = pd.concat(pairs_df).reset_index(drop=True)
333 | variant_df = pgr.variant_df
334 |
335 | if return_sparse:
336 | if variant_df is not None and phenotype_pos_df is not None:
337 | logger.write(' * filtering out cis-QTLs (within +/-5Mb)')
338 | pairs_df = trans.filter_cis(pairs_df, phenotype_pos_df, variant_df, window=5000000)
339 |
340 | logger.write(' * writing sparse output')
341 | if not args.output_text:
342 | pairs_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.trans_qtl_pairs.parquet'))
343 | else:
344 | out_file = os.path.join(args.output_dir, f'{args.prefix}.trans_qtl_pairs.txt.gz')
345 | pairs_df.to_csv(out_file, sep='\t', index=False, float_format='%.6g')
346 | else:
347 | logger.write(' * writing dense output')
348 | pval_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.trans_qtl_pval.parquet'))
349 | b_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.trans_qtl_beta.parquet'))
350 | b_se_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.trans_qtl_beta_se.parquet'))
351 | af_s.to_frame().to_parquet(os.path.join(args.output_dir, f'{args.prefix}.trans_qtl_af.parquet'))
352 |
353 | logger.write(f'[{datetime.now().strftime("%b %d %H:%M:%S")}] Finished mapping')
354 |
355 |
356 | if __name__ == '__main__':
357 | main()
358 |
--------------------------------------------------------------------------------
/tensorqtl/trans.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils import data
3 | import numpy as np
4 | import pandas as pd
5 | import scipy.stats as stats
6 | from collections import OrderedDict
7 | import sys
8 | import os
9 | import time
10 |
11 | sys.path.insert(1, os.path.dirname(__file__))
12 | import genotypeio
13 | from core import *
14 |
15 |
16 | def _in_cis(chrom, pos, gene_id, pos_dict, window=1000000):
17 |     """Test if a variant is within +/-window of a phenotype's position (TSS) or start-end interval."""
18 |     gene_dict = pos_dict[gene_id]
19 |     if chrom != gene_dict['chr']:
20 |         return False
21 |     if 'pos' in gene_dict:
22 |         start = gene_dict['pos']
23 |         end = start
24 |     else:
25 |         start = gene_dict['start']
26 |         end = gene_dict['end']
27 |     return start - window <= pos <= end + window
32 |
33 |
34 | def filter_cis(pairs_df, phenotype_pos_df, variant_df, window=5000000):
35 | """Filter out cis-QTLs
36 |
37 | Args:
38 | pairs_df: sparse output from map_trans()
39 |       phenotype_pos_df, variant_df: DataFrames with phenotype positions ('chr' + 'pos', or 'chr'/'start'/'end') and variant positions ('chrom', 'pos')
40 |       window: filter out variants within +/-window of the feature position (e.g., TSS for genes)
41 | """
42 | pos_dict = phenotype_pos_df.T.to_dict()
43 | variant_df = variant_df.loc[pairs_df['variant_id'].unique()].copy()
44 | variant_dict = {v:{'chrom':c, 'pos':p} for v,c,p in zip(variant_df.index, variant_df['chrom'], variant_df['pos'])}
45 |
46 | drop_ix = []
47 |     for k, gene_id, variant_id in zip(pairs_df.index, pairs_df['phenotype_id'], pairs_df['variant_id']):
48 | if _in_cis(variant_dict[variant_id]['chrom'], variant_dict[variant_id]['pos'], gene_id, pos_dict, window=window):
49 | drop_ix.append(k)
50 | return pairs_df.drop(drop_ix)
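
# A minimal usage sketch (hypothetical inputs), dropping trans pairs that fall in cis:
#   pairs_df = map_trans(genotype_df, phenotype_df, covariates_df=covariates_df)
#   pairs_df = filter_cis(pairs_df, phenotype_pos_df, variant_df, window=5000000)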
51 |
52 |
53 | def map_trans(genotype_df, phenotype_df, covariates_df=None, interaction_s=None,
54 | return_sparse=True, pval_threshold=1e-5, maf_threshold=0.05,
55 | alleles=2, return_r2=False, batch_size=20000,
56 | logp=False, logger=None, verbose=True):
57 | """Run trans-QTL mapping
58 |
59 | Outputs (return_sparse == True):
60 |       pval_df: DataFrame with columns variant_id, phenotype_id, pval, b, b_se, af (and r2 when return_r2=True)
61 | Outputs (return_sparse == False):
62 |       pval_df: p-values (variants x phenotypes)
63 |       b_df: slopes (variants x phenotypes)
64 |       b_se_df: slope standard errors (variants x phenotypes)
65 |       af_s: allele frequencies (variants)
66 | """
67 |
68 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
69 |
70 | if logger is None:
71 | logger = SimpleLogger(verbose=verbose)
72 |
73 | variant_ids = genotype_df.index.tolist()
74 | variant_dict = {i:j for i,j in enumerate(variant_ids)}
75 | n_variants = len(variant_ids)
76 | n_samples = phenotype_df.shape[1]
77 | dof = n_samples - 2
78 |
79 | logger.write('trans-QTL mapping')
80 | logger.write(f' * {n_samples} samples')
81 | logger.write(f' * {phenotype_df.shape[0]} phenotypes')
82 | if covariates_df is not None:
83 | assert np.all(phenotype_df.columns==covariates_df.index)
84 | logger.write(f' * {covariates_df.shape[1]} covariates')
85 | residualizer = Residualizer(torch.tensor(covariates_df.values, dtype=torch.float32).to(device))
86 | dof -= covariates_df.shape[1]
87 | else:
88 | residualizer = None
89 | logger.write(f' * {n_variants} variants')
90 | if interaction_s is not None:
91 | logger.write(' * including interaction term')
92 |
93 | phenotypes_t = torch.tensor(phenotype_df.values, dtype=torch.float32).to(device)
94 | genotype_ix = np.array([genotype_df.columns.tolist().index(i) for i in phenotype_df.columns])
95 | genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
96 |
97 | # calculate correlation threshold for sparse output
98 | if return_sparse:
99 | tstat_threshold = -stats.t.ppf(pval_threshold/2, dof)
100 | r_threshold = tstat_threshold / np.sqrt(dof + tstat_threshold**2)
101 | else:
102 | tstat_threshold = None
103 | r_threshold = None
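    # the r threshold above inverts t = r*sqrt(dof/(1-r^2)): solving for r gives
    # r = t/sqrt(dof + t^2), so |r| >= r_threshold is equivalent to nominal p <= pval_threshold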
104 |
105 | if interaction_s is None:
106 | ggt = genotypeio.GenotypeGeneratorTrans(genotype_df, batch_size=batch_size)
107 | start_time = time.time()
108 | res = []
109 | n_variants = 0
110 | for k, (genotypes, variant_ids) in enumerate(ggt.generate_data(verbose=verbose), 1):
111 | # copy genotypes to GPU
112 | genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
113 |
114 | # filter by MAF
115 | genotypes_t = genotypes_t[:,genotype_ix_t]
116 | impute_mean(genotypes_t)
117 | genotypes_t, variant_ids, af_t = filter_maf(genotypes_t, variant_ids, maf_threshold)
118 | n_variants += genotypes_t.shape[0]
119 |
120 | r_t, genotype_var_t, phenotype_var_t = calculate_corr(genotypes_t, phenotypes_t, residualizer=residualizer, return_var=True)
121 | del genotypes_t
122 |
123 | if return_sparse:
124 | m = r_t.abs() >= r_threshold
125 | ix_t = m.nonzero(as_tuple=False) # sparse index
126 | ix = ix_t.cpu().numpy()
127 |
128 | r_t = r_t.masked_select(m).type(torch.float64)
129 | r2_t = r_t.pow(2)
130 | tstat_t = r_t * torch.sqrt(dof / (1 - r2_t))
131 | std_ratio_t = torch.sqrt(phenotype_var_t[ix_t[:,1]] / genotype_var_t[ix_t[:,0]])
132 | b_t = r_t * std_ratio_t
133 | b_se_t = (b_t / tstat_t).type(torch.float32)
134 |
135 | res.append(np.c_[
136 | variant_ids[ix[:,0]], phenotype_df.index[ix[:,1]],
137 | tstat_t.cpu(), b_t.cpu(), b_se_t.cpu(),
138 | r2_t.float().cpu(), af_t[ix_t[:,0]].cpu()
139 | ])
140 | else: # dense output: pval, b, b_se, af
141 | r_t = r_t.type(torch.float64)
142 | tstat_t = r_t * torch.sqrt(dof / (1 - r_t.pow(2)))
143 | std_ratio_t = torch.sqrt(phenotype_var_t / genotype_var_t.reshape(-1,1))
144 | b_t = (r_t * std_ratio_t).type(torch.float32)
145 | b_se_t = (b_t / tstat_t).type(torch.float32)
146 | res.append([variant_ids, tstat_t.cpu(), b_t.cpu(), b_se_t.cpu(), af_t.cpu()])
147 |
148 | logger.write(f' elapsed time: {(time.time()-start_time)/60:.2f} min')
149 | del phenotypes_t
150 | del residualizer
151 |
152 | if maf_threshold > 0:
153 | logger.write(f' * {n_variants} variants passed MAF >= {maf_threshold} filtering')
154 |
155 | # post-processing: concatenate batches
156 | if return_sparse:
157 | res = np.concatenate(res)
158 | res[:,2] = get_t_pval(res[:,2].astype(np.float64), dof, log=logp)
159 | pval_df = pd.DataFrame(res, columns=['variant_id', 'phenotype_id', 'pval', 'b', 'b_se', 'r2', 'af'])
160 | pval_df['pval'] = pval_df['pval'].astype(np.float64)
161 | pval_df['b'] = pval_df['b'].astype(np.float32)
162 | pval_df['b_se'] = pval_df['b_se'].astype(np.float32)
163 | pval_df['r2'] = pval_df['r2'].astype(np.float32)
164 | pval_df['af'] = pval_df['af'].astype(np.float32)
165 | if not return_r2:
166 | pval_df.drop('r2', axis=1, inplace=True)
167 | logger.write('done.')
168 | return pval_df
169 | else:
170 | variant_ids = pd.Series(np.concatenate([i[0] for i in res]), name='variant_id')
171 | pval_df = pd.DataFrame(get_t_pval(np.concatenate([i[1] for i in res]).astype(np.float64), dof, log=logp),
172 | index=variant_ids, columns=phenotype_df.index)
173 | b_df = pd.DataFrame(np.concatenate([i[2] for i in res]),
174 | index=variant_ids, columns=phenotype_df.index)
175 | b_se_df = pd.DataFrame(np.concatenate([i[3] for i in res]),
176 | index=variant_ids, columns=phenotype_df.index)
177 | af_s = pd.Series(np.concatenate([i[4] for i in res]),
178 | index=variant_ids, name='af')
179 | logger.write('done.')
180 | return pval_df, b_df, b_se_df, af_s
181 |
182 |
183 | else: # interaction model
184 | dof = n_samples - 4 - covariates_df.shape[1]
185 | interaction_t = torch.tensor(interaction_s.values.reshape(1,-1), dtype=torch.float32).to(device) # 1 x n_samples
186 | mask_s = pd.Series(True, index=interaction_s.index)
187 | mask_s[interaction_s.sort_values(kind='mergesort').index[:interaction_s.shape[0]//2]] = False
188 | interaction_mask_t = torch.BoolTensor(mask_s.values).to(device)
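        # mask_s marks the upper half of samples by interaction value (lower half -> False);
        # the mask is passed to filter_maf_interaction so that the MAF filter can be
        # applied within each half of the interaction distribution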
189 |
190 | ggt = genotypeio.GenotypeGeneratorTrans(genotype_df, batch_size=batch_size)
191 | start_time = time.time()
192 | if return_sparse:
193 |
194 | nps = phenotypes_t.shape[0]
195 | i0_t = interaction_t - interaction_t.mean()
196 | p0_t = phenotypes_t - phenotypes_t.mean(1, keepdim=True)
197 | p0_t = residualizer.transform(p0_t, center=False)
198 | i0_t = residualizer.transform(i0_t, center=False)
199 |
200 | tstat_g_list = []
201 | tstat_i_list = []
202 | tstat_gi_list = []
203 | af_list = []
204 | ix0 = []
205 | ix1 = []
206 | for k, (genotypes, variant_ids) in enumerate(ggt.generate_data(verbose=verbose), 1):
207 | genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
208 | genotypes_t, mask_t = filter_maf_interaction(genotypes_t[:, genotype_ix_t],
209 | interaction_mask_t=interaction_mask_t,
210 | maf_threshold_interaction=maf_threshold)
211 | if genotypes_t.shape[0] > 0:
212 | ng, ns = genotypes_t.shape
213 |
214 | # calculate allele frequency
215 | af_t = genotypes_t.sum(1) / (2*ns)
216 |
217 | # centered inputs
218 | g0_t = genotypes_t - genotypes_t.mean(1, keepdim=True)
219 | gi_t = genotypes_t * interaction_t
220 | gi0_t = gi_t - gi_t.mean(1, keepdim=True)
221 | # residualize rows
222 | g0_t = residualizer.transform(g0_t, center=False)
223 | gi0_t = residualizer.transform(gi0_t, center=False)
224 |
225 | # regression
226 | X_t = torch.stack([g0_t, i0_t.repeat(ng, 1), gi0_t], 2) # ng x ns x 3
227 | Xinv = torch.matmul(torch.transpose(X_t, 1, 2), X_t).inverse() # ng x 3 x 3
228 | b_t = torch.matmul(torch.matmul(Xinv, torch.transpose(X_t, 1, 2)), p0_t.t()) # ng x 3 x np
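                    # batched ordinary least squares: for each of the ng variants this evaluates
                    # b = (X'X)^-1 X'y for the model y ~ b_g*g + b_i*i + b_gi*(g*i),
                    # with covariates already residualized out of all inputs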
229 | dof = residualizer.dof - 2
230 |
231 | rss_t = (torch.matmul(X_t, b_t) - p0_t.t()).pow(2).sum(1) # ng x np
232 | b_se_t = torch.sqrt(Xinv[:, torch.eye(3, dtype=torch.uint8).bool()].unsqueeze(-1).repeat([1,1,nps]) * rss_t.unsqueeze(1).repeat([1,3,1]) / dof)
233 | tstat_t = (b_t.double() / b_se_t.double()).float() # (ng x 3 x np)
234 | tstat_g_t = tstat_t[:,0,:] # genotypes x phenotypes
235 | tstat_i_t = tstat_t[:,1,:]
236 | tstat_gi_t = tstat_t[:,2,:]
237 | m = tstat_gi_t.abs() >= tstat_threshold
238 | tstat_g_t = tstat_g_t[m]
239 | tstat_i_t = tstat_i_t[m]
240 | tstat_gi_t = tstat_gi_t[m]
241 | ix = m.nonzero(as_tuple=False) # indexes: [genotype, phenotype]
242 | af_t = af_t[ix[:,0]]
243 |
244 | res = [tstat_g_t, tstat_i_t, tstat_gi_t, af_t, ix]
245 | tstat_g, tstat_i, tstat_gi, af, ix = [i.cpu().numpy() for i in res]
246 | mask = mask_t.cpu().numpy()
247 | # convert sparse indexes
248 | if len(ix)>0:
249 | variant_ids = variant_ids[mask.astype(bool)]
250 | tstat_g_list.append(tstat_g)
251 | tstat_i_list.append(tstat_i)
252 | tstat_gi_list.append(tstat_gi)
253 | af_list.append(af)
254 | ix0.extend(variant_ids[ix[:,0]].tolist())
255 | ix1.extend(phenotype_df.index[ix[:,1]].tolist())
256 |
257 | logger.write(f' time elapsed: {(time.time()-start_time)/60:.2f} min')
258 |
259 | # concatenate
260 | pval_g = get_t_pval(np.concatenate(tstat_g_list), dof, log=logp)
261 | pval_i = get_t_pval(np.concatenate(tstat_i_list), dof, log=logp)
262 | pval_gi = get_t_pval(np.concatenate(tstat_gi_list), dof, log=logp)
263 | af = np.concatenate(af_list)
264 |
265 | pval_df = pd.DataFrame(np.c_[ix0, ix1, pval_g, pval_i, pval_gi, af],
266 | columns=['variant_id', 'phenotype_id', 'pval_g', 'pval_i', 'pval_gi', 'af']
267 | ).astype({'pval_g':np.float64, 'pval_i':np.float64, 'pval_gi':np.float64, 'af':np.float32})
268 | return pval_df
269 | else: # dense output
270 | output_list = []
271 | for k, (genotypes, variant_ids) in enumerate(ggt.generate_data(verbose=verbose), 1):
272 | genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
273 | genotypes_t, mask_t = filter_maf_interaction(genotypes_t[:, genotype_ix_t],
274 | interaction_mask_t=interaction_mask_t,
275 | maf_threshold_interaction=maf_threshold)
276 | res = calculate_interaction_nominal(genotypes_t, phenotypes_t, interaction_t.t(), residualizer,
277 | return_sparse=return_sparse)
278 | # res: tstat, b, b_se, af, ma_samples, ma_count
279 | res = [i.cpu().numpy() for i in res]
280 | mask = mask_t.cpu().numpy()
281 | variant_ids = variant_ids[mask.astype(bool)]
282 | output_list.append(res + [variant_ids])
283 | logger.write(f' time elapsed: {(time.time()-start_time)/60:.2f} min')
284 |
285 | # concatenate outputs
286 | tstat = np.concatenate([i[0] for i in output_list])
287 | pval = get_t_pval(tstat, dof, log=logp)
288 | b = np.concatenate([i[1] for i in output_list])
289 | b_se = np.concatenate([i[2] for i in output_list])
290 | af = np.concatenate([i[3] for i in output_list])
291 | ma_samples = np.concatenate([i[4] for i in output_list])
292 | ma_count = np.concatenate([i[5] for i in output_list])
293 | variant_ids = np.concatenate([i[6] for i in output_list])
294 |
295 | pval_g_df = pd.DataFrame(pval[:,0,:], index=variant_ids, columns=phenotype_df.index)
296 | pval_i_df = pd.DataFrame(pval[:,1,:], index=variant_ids, columns=phenotype_df.index)
297 | pval_gi_df = pd.DataFrame(pval[:,2,:], index=variant_ids, columns=phenotype_df.index)
298 | af_s = pd.Series(af, index=variant_ids, name='af').astype(np.float32)
299 | ma_samples_s = pd.Series(ma_samples, index=variant_ids, name='ma_samples').astype(np.int32)
300 |             ma_count_s = pd.Series(ma_count, index=variant_ids, name='ma_count').astype(np.int32)
301 | return pval_g_df, pval_i_df, pval_gi_df, af_s, ma_samples_s, ma_count_s
302 |
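# A minimal usage sketch for map_trans (hypothetical data frames; genotypes are
# variants x samples, phenotypes are phenotypes x samples):
#   pairs_df = map_trans(genotype_df, phenotype_df, covariates_df=covariates_df,
#                        return_sparse=True, pval_threshold=1e-5, maf_threshold=0.05)
#   # columns: variant_id, phenotype_id, pval, b, b_se, af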
303 |
304 | def map_permutations(genotype_df, covariates_df, permutations=None,
305 | chr_s=None, nperms=10000, maf_threshold=0.05,
306 | batch_size=20000, logger=None, seed=None, verbose=True):
307 | """
308 |
309 |
310 | Warning: this function assumes that all phenotypes are normally distributed,
311 | e.g., inverse normal transformed
312 | """
313 |
314 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
315 |
316 | if logger is None:
317 | logger = SimpleLogger()
318 | assert covariates_df.index.isin(genotype_df.columns).all()
319 | sample_ids = covariates_df.index.values
320 |
321 | variant_ids = genotype_df.index.tolist()
322 |
323 | # index of VCF samples corresponding to phenotypes
324 | genotype_ix = np.array([genotype_df.columns.tolist().index(i) for i in sample_ids])
325 | genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
326 |
327 | n_variants = len(variant_ids)
328 | n_samples = len(sample_ids)
329 | dof = n_samples - 2 - covariates_df.shape[1]
330 |
331 | logger.write('trans-QTL mapping (permutations)')
332 | logger.write(f' * {n_samples} samples')
333 | logger.write(f' * {covariates_df.shape[1]} covariates')
334 | logger.write(f' * {n_variants} variants')
335 |
336 | if permutations is None: # generate permutations assuming normal distribution
337 | q = stats.norm.ppf(np.arange(1,n_samples+1)/(n_samples+1))
338 | permutations = np.tile(q,[nperms,1])
339 | if seed is not None:
340 | np.random.seed(seed)
341 | for i in np.arange(nperms):
342 | np.random.shuffle(permutations[i,:])
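        # each permutation row is an independent shuffle of the same standard-normal
        # quantiles, consistent with inverse-normal-transformed phenotypes (see docstring)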
343 | else:
344 | assert permutations.shape[1]==n_samples
345 | nperms = permutations.shape[0]
346 | logger.write(f' * {nperms} permutations')
347 |
348 | permutations_t = torch.tensor(permutations, dtype=torch.float32).to(device)
349 | residualizer = Residualizer(torch.tensor(covariates_df.values, dtype=torch.float32).to(device))
350 |
351 | if chr_s is not None:
352 | assert chr_s.index.equals(genotype_df.index)
353 | start_time = time.time()
354 | n_variants = 0
355 | ggt = genotypeio.GenotypeGeneratorTrans(genotype_df, batch_size=batch_size, chr_s=chr_s)
356 | total_batches = np.sum([len(ggt.chr_batch_indexes[c]) for c in ggt.chroms])
357 |
358 | chr_max_r2 = OrderedDict()
359 | k = 0
360 | for chrom in ggt.chroms:
361 | max_r2_t = torch.FloatTensor(nperms).fill_(0).to(device)
362 | for k, (genotypes, variant_ids) in enumerate(ggt.generate_data(chrom=chrom, verbose=verbose, enum_start=k+1), k+1):
363 | genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
364 | genotypes_t = genotypes_t[:, genotype_ix_t]
365 | impute_mean(genotypes_t)
366 | genotypes_t, _, _ = filter_maf(genotypes_t, variant_ids, maf_threshold)
367 | n_variants += genotypes_t.shape[0]
368 |
369 | r2_t = calculate_corr(genotypes_t, permutations_t, residualizer=residualizer).pow(2)
370 | del genotypes_t
371 | m,_ = r2_t.max(0)
372 | max_r2_t = torch.max(m, max_r2_t)
373 | chr_max_r2[chrom] = max_r2_t.cpu()
374 | logger.write(f' time elapsed: {(time.time()-start_time)/60:.2f} min')
375 | if maf_threshold > 0:
376 | logger.write(f' * {n_variants} variants passed MAF >= {maf_threshold} filtering')
377 | chr_max_r2 = pd.DataFrame(chr_max_r2)
378 |
379 | # leave-one-out max
380 | max_r2 = OrderedDict()
381 | for c in chr_max_r2:
382 | max_r2[c] = chr_max_r2[np.setdiff1d(chr_max_r2.columns, c)].max(1)
383 | max_r2 = pd.DataFrame(max_r2) # nperms x chrs
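        # leave-one-out: the null for chromosome c is the max r2 over all *other*
        # chromosomes, so that signals on a phenotype's own chromosome (potentially
        # cis-driven) do not contribute to its null distribution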
384 |
385 | # empirical p-values
386 |         tstat = np.sqrt(dof*max_r2 / (1-max_r2))
387 | minp_empirical = pd.DataFrame(2*stats.t.cdf(-np.abs(tstat), dof), columns=tstat.columns) # nperms x chrs
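        # r2 -> two-sided p-value: t = sqrt(dof*r2/(1-r2)), p = 2*P(T_dof <= -|t|)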
388 |
389 | beta_shape1 = OrderedDict()
390 | beta_shape2 = OrderedDict()
391 | true_dof = OrderedDict()
392 | minp_vec = OrderedDict()
393 | for c in max_r2:
394 | beta_shape1[c], beta_shape2[c], true_dof[c], minp_vec[c] = fit_beta_parameters(max_r2[c], dof, return_minp=True)
395 |
396 | beta_df = pd.DataFrame(OrderedDict([
397 | ('beta_shape1', beta_shape1),
398 | ('beta_shape2', beta_shape2),
399 | ('true_df', true_dof),
400 | ('minp_true_df', minp_vec),
401 | ('minp_empirical', {c:minp_empirical[c].values for c in minp_empirical}),
402 | ]))
403 | return beta_df
404 |
405 | else: # not split_chr
406 | ggt = genotypeio.GenotypeGeneratorTrans(genotype_df, batch_size=batch_size)
407 | start_time = time.time()
408 | max_r2_t = torch.FloatTensor(nperms).fill_(0).to(device)
409 | n_variants = 0
410 | for k, (genotypes, variant_ids) in enumerate(ggt.generate_data(verbose=verbose), 1):
411 | genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
412 | genotypes_t = genotypes_t[:, genotype_ix_t]
413 | impute_mean(genotypes_t)
414 | genotypes_t, _, _ = filter_maf(genotypes_t, variant_ids, maf_threshold)
415 | n_variants += genotypes_t.shape[0]
416 |
417 | r2_t = calculate_corr(genotypes_t, permutations_t, residualizer=residualizer).pow(2)
418 | del genotypes_t
419 | m,_ = r2_t.max(0)
420 | max_r2_t = torch.max(m, max_r2_t)
421 | logger.write(f' time elapsed: {(time.time()-start_time)/60:.2f} min')
422 | if maf_threshold > 0:
423 | logger.write(f' * {n_variants} variants passed MAF >= {maf_threshold} filtering')
424 | max_r2 = max_r2_t.cpu().numpy().astype(np.float64)
425 |         tstat = np.sqrt(dof*max_r2 / (1-max_r2))
426 | minp_empirical = 2*stats.t.cdf(-np.abs(tstat), dof)
427 | beta_shape1, beta_shape2, true_dof, minp_vec = fit_beta_parameters(max_r2, dof, tol=1e-4, return_minp=True)
428 |
429 | beta_s = pd.Series([n_samples, dof, beta_shape1, beta_shape2, true_dof, minp_vec, minp_empirical],
430 | index=['num_samples', 'df', 'beta_shape1', 'beta_shape2', 'true_df', 'minp_true_df', 'minp_empirical'])
431 | return beta_s
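
# A minimal end-to-end sketch (hypothetical inputs): compute trans pairs, build the
# permutation null, then annotate the pairs in place:
#   pairs_df = map_trans(genotype_df, phenotype_df, covariates_df=covariates_df, return_r2=True)
#   perm_res = map_permutations(genotype_df, covariates_df, nperms=10000)
#   apply_permutations(perm_res, pairs_df)  # adds pval_perm and pval_beta columns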
432 |
433 |
434 | def apply_permutations(res, pairs_df):
435 | """
436 | res: output from map_permutations()
437 | pairs_df: output from map_trans()
438 | """
439 |
440 | if isinstance(res, pd.Series): # chrs not split
441 | nperms = len(res['minp_true_df'])
442 | for k in ['beta_shape1', 'beta_shape2', 'true_df']:
443 | pairs_df[k] = res[k]
444 |         pairs_df['pval_true_df'] = pval_from_corr(pairs_df['r2'], pairs_df['true_df'])
445 |         pairs_df['pval_perm'] = np.array([(np.sum(res['minp_empirical']<=p)+1)/(nperms+1) for p in pairs_df['pval']])
446 |         pairs_df['pval_beta'] = stats.beta.cdf(pairs_df['pval_true_df'], pairs_df['beta_shape1'], pairs_df['beta_shape2'])
447 |
448 | elif isinstance(res, pd.DataFrame): # chrs split
449 | nperms = len(res['minp_empirical'][0])
450 | for k in ['beta_shape1', 'beta_shape2', 'true_df']:
451 | pairs_df[k] = res.loc[pairs_df['phenotype_chr'], k].values
452 | pairs_df['pval_true_df'] = pval_from_corr(pairs_df['r2'], pairs_df['true_df'])
453 | pairs_df['pval_perm'] = [(np.sum(pe<=p)+1)/(nperms+1) for p,pe in zip(pairs_df['pval'], res.loc[pairs_df['phenotype_chr'], 'minp_empirical'])]
456 | pairs_df['pval_beta'] = stats.beta.cdf(pairs_df['pval_true_df'], pairs_df['beta_shape1'], pairs_df['beta_shape2'])
457 |
--------------------------------------------------------------------------------