├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── docs └── outputs.md ├── example ├── GTEx_v8_example.ipynb ├── data │ ├── GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18.pgen │ ├── GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18.psam │ ├── GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18.pvar │ ├── GEUVADIS.445_samples.covariates.txt │ └── GEUVADIS.445_samples.expression.bed.gz └── tensorqtl_examples.ipynb ├── install ├── INSTALL.md ├── install_cuda.sh └── tensorqtl_env.yml ├── pyproject.toml └── tensorqtl ├── __init__.py ├── __main__.py ├── cis.py ├── coloc.py ├── core.py ├── eigenmt.py ├── genotypeio.py ├── mixqtl.py ├── pgen.py ├── post.py ├── rfunc.py ├── susie.py ├── tensorqtl.py └── trans.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.egg-info/ 3 | *.ipynb_checkpoints/ 4 | build/ 5 | dist/ 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile for tensorQTL 2 | # https://gitlab.com/nvidia/container-images/cuda/blob/master/doc/unsupported-tags.md 3 | FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 4 | LABEL maintainer="Francois Aguet" 5 | 6 | RUN apt-get update && apt-get install -y software-properties-common && \ 7 | apt-get update && apt-get install -y \ 8 | apt-transport-https \ 9 | build-essential \ 10 | cmake \ 11 | curl \ 12 | libboost-all-dev \ 13 | libbz2-dev \ 14 | libcurl3-dev \ 15 | liblzma-dev \ 16 | libncurses5-dev \ 17 | libssl-dev \ 18 | python3 \ 19 | python3-pip \ 20 | sudo \ 21 | unzip \ 22 | wget \ 23 | zlib1g-dev \ 24 | && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ 25 | apt-get clean && \ 26 | apt-get autoremove -y && \ 27 | rm -rf /var/lib/{apt,dpkg,cache,log}/ 28 | 29 | # htslib 30 | RUN cd /opt && \ 31 | wget --no-check-certificate https://github.com/samtools/htslib/releases/download/1.19/htslib-1.19.tar.bz2 && \ 32 | tar -xf htslib-1.19.tar.bz2 && rm htslib-1.19.tar.bz2 && cd htslib-1.19 && \ 33 | ./configure --enable-libcurl --enable-s3 --enable-plugins --enable-gcs && \ 34 | make && make install && make clean 35 | 36 | # bcftools 37 | RUN cd /opt && \ 38 | wget --no-check-certificate https://github.com/samtools/bcftools/releases/download/1.19/bcftools-1.19.tar.bz2 && \ 39 | tar -xf bcftools-1.19.tar.bz2 && rm bcftools-1.19.tar.bz2 && cd bcftools-1.19 && \ 40 | ./configure --with-htslib=system && make && make install && make clean 41 | 42 | # install R 43 | ENV DEBIAN_FRONTEND=noninteractive 44 | RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 45 | RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/' 46 | RUN apt update && apt install -y r-base r-base-dev 47 | ENV R_LIBS_USER=/opt/R/4.0 48 | RUN Rscript -e 'if (!requireNamespace("BiocManager", quietly = TRUE)) {install.packages("BiocManager")}; BiocManager::install("qvalue");' 49 | 50 | # python modules 51 | RUN pip3 install --upgrade pip setuptools 52 | RUN pip3 install numpy pandas scipy 53 | RUN pip3 install pandas-plink ipython jupyter matplotlib pyarrow torch rpy2 gcsfs "Pgenlib>=0.90.1" 54 | RUN pip3 install tensorqtl==1.0.9 55 | 56 | # RUN cd /opt && \ 57 | # wget https://github.com/broadinstitute/tensorqtl/archive/v1.0.8.tar.gz && \ 58 | # tar -xf v1.0.8.tar.gz && mv tensorqtl-1.0.8 tensorqtl && \ 59 | # rm v1.0.8.tar.gz 60 | # RUN pip3 install 
/opt/tensorqtl/ 61 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018-2019, The Broad Institute, Inc. and The General Hospital Corporation. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## tensorQTL 2 | 3 | tensorQTL is a GPU-enabled QTL mapper, achieving ~200-300 fold faster *cis*- and *trans*-QTL mapping compared to CPU-based implementations. 4 | 5 | If you use tensorQTL in your research, please cite the following paper: 6 | [Taylor-Weiner, Aguet, et al., *Genome Biol.*, 2019](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1836-7).
7 | Empirical beta-approximated p-values are computed as described in [Ongen et al., *Bioinformatics*, 2016](https://academic.oup.com/bioinformatics/article/32/10/1479/1742545). 8 | 9 | ### Install 10 | You can install tensorQTL using pip: 11 | ``` 12 | pip3 install tensorqtl 13 | ``` 14 | or directly from this repository: 15 | ``` 16 | $ git clone git@github.com:broadinstitute/tensorqtl.git 17 | $ cd tensorqtl 18 | # install into a new virtual environment and load 19 | $ mamba env create -f install/tensorqtl_env.yml 20 | $ conda activate tensorqtl 21 | ``` 22 | To install the latest version from this repository, run 23 | ``` 24 | pip install git+https://github.com/broadinstitute/tensorqtl.git 25 | ``` 26 | 27 | To use PLINK 2 binary files ([pgen/pvar/psam](https://www.cog-genomics.org/plink/2.0/input#pgen)), [pgenlib](https://github.com/chrchang/plink-ng/tree/master/2.0/Python) must be installed using either 28 | ``` 29 | pip install Pgenlib 30 | ``` 31 | (this is included in `tensorqtl_env.yml` above), or from source: 32 | ``` 33 | git clone git@github.com:chrchang/plink-ng.git 34 | cd plink-ng/2.0/Python/ 35 | python3 setup.py build_ext 36 | python3 setup.py install 37 | ``` 38 | 39 | ### Requirements 40 | 41 | tensorQTL requires an environment configured with a GPU for optimal performance, but can also be run on a CPU. Instructions for setting up a virtual machine on Google Cloud Platform are provided [here](install/INSTALL.md). 42 | 43 | ### Input formats 44 | Three inputs are required for QTL analyses with tensorQTL: genotypes, phenotypes, and covariates. 45 | * Phenotypes must be provided in BED format, with a single header line starting with `#` and the first four columns corresponding to: `chr`, `start`, `end`, `phenotype_id`, with the remaining columns corresponding to samples (the identifiers must match those in the genotype input). In addition to .bed/.bed.gz, BED input in .parquet is also supported. The BED file can specify the center of the *cis*-window (usually the TSS), with `start == end-1`, or alternatively, start and end positions, in which case the *cis*-window is [start-window, end+window]. A function for generating a BED template from a gene annotation in GTF format is available in [pyqtl](https://github.com/broadinstitute/pyqtl) (`io.gtf_to_tss_bed`). 46 | * Covariates can be provided as a tab-delimited text file (covariates x samples) or dataframe (samples x covariates), with row and column headers. 47 | * Genotypes should preferably be in [PLINK2](https://www.cog-genomics.org/plink/2.0/) pgen/pvar/psam format, which can be generated from a VCF as follows: 48 | ``` 49 | plink2 \ 50 | --output-chr chrM \ 51 | --vcf ${plink_prefix_path}.vcf.gz \ 52 | --out ${plink_prefix_path} 53 | ``` 54 | If using `--make-bed` with PLINK 1.9 or earlier, add the `--keep-allele-order` flag. 55 | 56 | Alternatively, the genotypes can be provided in bed/bim/fam format, or as a parquet dataframe (genotypes x samples). 57 | 58 | 59 | The [examples notebook](example/tensorqtl_examples.ipynb) contains examples of all input files. The input formats for phenotypes and covariates are identical to those used by [FastQTL](https://github.com/francois-a/fastqtl). 60 | 61 | ### Examples 62 | For examples illustrating *cis*- and *trans*-QTL mapping, please see [tensorqtl_examples.ipynb](example/tensorqtl_examples.ipynb). 63 | 64 | ### Running tensorQTL 65 | This section describes how to run the different modes of tensorQTL, both from the command line and within Python. 
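A typical *cis*-QTL analysis in Python combines the steps detailed in the subsections below; for example (a condensed sketch of those steps, with placeholder file paths):
```
import pandas as pd
import tensorqtl
from tensorqtl import genotypeio, cis

# load phenotypes, covariates, and genotypes (paths are placeholders)
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed('phenotypes.bed.gz')
covariates_df = pd.read_csv('covariates.txt', sep='\t', index_col=0).T  # samples x covariates
pr = genotypeio.PlinkReader('genotypes')  # prefix of the .bed/.bim/.fam files
genotype_df = pr.load_genotypes()
variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]

# map cis-QTLs with permutations and compute q-values
cis_df = cis.map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df)
tensorqtl.calculate_qvalues(cis_df)
```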
66 | For a full list of options, run 67 | ``` 68 | python3 -m tensorqtl --help 69 | ``` 70 | 71 | #### Loading input files 72 | This section is only relevant when running tensorQTL in Python. 73 | The following imports are required: 74 | ``` 75 | import pandas as pd 76 | import tensorqtl 77 | from tensorqtl import genotypeio, cis, trans 78 | ``` 79 | Phenotypes and covariates can be loaded as follows: 80 | ``` 81 | phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(phenotype_bed_file) 82 | covariates_df = pd.read_csv(covariates_file, sep='\t', index_col=0).T # samples x covariates 83 | ``` 84 | Genotypes can be loaded as follows, where `plink_prefix_path` is the path to the genotypes in PLINK format (excluding the `.bed`/`.bim`/`.fam` extensions): 85 | ``` 86 | pr = genotypeio.PlinkReader(plink_prefix_path) 87 | # load genotypes and variants into data frames 88 | genotype_df = pr.load_genotypes() 89 | variant_df = pr.bim.set_index('snp')[['chrom', 'pos']] 90 | ``` 91 | To save memory when only a subset of samples is needed, this subset can be loaded directly (this is not strictly necessary, since tensorQTL will otherwise select the relevant samples from `genotype_df`): 92 | ``` 93 | pr = genotypeio.PlinkReader(plink_prefix_path, select_samples=phenotype_df.columns) 94 | ``` 95 | 96 | #### *cis*-QTL mapping: permutations 97 | This is the main mode for *cis*-QTL mapping. It generates phenotype-level summary statistics with empirical p-values, enabling calculation of genome-wide FDR. 98 | In Python: 99 | ``` 100 | cis_df = cis.map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df) 101 | tensorqtl.calculate_qvalues(cis_df, qvalue_lambda=0.85) 102 | ``` 103 | Shell command: 104 | ``` 105 | python3 -m tensorqtl ${plink_prefix_path} ${expression_bed} ${prefix} \ 106 | --covariates ${covariates_file} \ 107 | --mode cis 108 | ``` 109 | `${prefix}` specifies the prefix of the output files. 110 | 111 | #### *cis*-QTL mapping: summary statistics for all variant-phenotype pairs 112 | In Python: 113 | ``` 114 | cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, 115 | prefix, covariates_df, output_dir='.') 116 | ``` 117 | Shell command: 118 | ``` 119 | python3 -m tensorqtl ${plink_prefix_path} ${expression_bed} ${prefix} \ 120 | --covariates ${covariates_file} \ 121 | --mode cis_nominal 122 | ``` 123 | The results are written to a [parquet](https://parquet.apache.org/) file for each chromosome. These files can be read using `pandas`: 124 | ``` 125 | df = pd.read_parquet(file_name) 126 | ``` 127 | #### *cis*-QTL mapping: conditionally independent QTLs 128 | This mode maps conditionally independent *cis*-QTLs using the stepwise regression procedure described in [GTEx Consortium, 2017](https://www.nature.com/articles/nature24277). The output from the permutation step (see `map_cis` above) is required. 129 | In Python: 130 | ``` 131 | indep_df = cis.map_independent(genotype_df, variant_df, cis_df, 132 | phenotype_df, phenotype_pos_df, covariates_df) 133 | ``` 134 | Shell command: 135 | ``` 136 | python3 -m tensorqtl ${plink_prefix_path} ${expression_bed} ${prefix} \ 137 | --covariates ${covariates_file} \ 138 | --cis_output ${prefix}.cis_qtl.txt.gz \ 139 | --mode cis_independent 140 | ``` 141 | 
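If the permutation results (see `map_cis` above) were written to file by a previous run, they can be reloaded before calling `map_independent`, e.g. (a sketch assuming the output name used in the shell command above):
```
cis_df = pd.read_csv(f'{prefix}.cis_qtl.txt.gz', sep='\t', index_col=0)
```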
142 | #### *cis*-QTL mapping: interactions 143 | Instead of mapping the standard linear model (p ~ g), this mode includes an interaction term (p ~ g + i + gi) and returns full summary statistics for the model. The interaction term(s) must be provided as a tab-delimited text file or dataframe mapping sample IDs to interaction value(s) (if multiple interactions are used, the file must include a header with variable names). With the `run_eigenmt=True` option, [eigenMT](https://www.cell.com/ajhg/fulltext/S0002-9297(15)00492-9)-adjusted p-values are computed. 144 | In Python: 145 | ``` 146 | cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, prefix, 147 | covariates_df=covariates_df, 148 | interaction_df=interaction_df, maf_threshold_interaction=0.05, 149 | run_eigenmt=True, output_dir='.', write_top=True, write_stats=True) 150 | ``` 151 | The input options `write_top` and `write_stats` control whether the top association per phenotype and full summary statistics, respectively, are written to file. 152 | 153 | Shell command: 154 | ``` 155 | python3 -m tensorqtl ${plink_prefix_path} ${expression_bed} ${prefix} \ 156 | --covariates ${covariates_file} \ 157 | --interaction ${interactions_file} \ 158 | --best_only \ 159 | --mode cis_nominal 160 | ``` 161 | The option `--best_only` disables output of full summary statistics. 162 | 163 | Full summary statistics are saved as [parquet](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html) files for each chromosome, in `${output_dir}/${prefix}.cis_qtl_pairs.${chr}.parquet`, and the top association for each phenotype is saved to `${output_dir}/${prefix}.cis_qtl_top_assoc.txt.gz`. In these files, the columns `b_g`, `b_g_se`, `pval_g` are the effect size, standard error, and p-value of *g* in the model, with matching columns for *i* and *gi*. In the `*.cis_qtl_top_assoc.txt.gz` file, `tests_emt` is the effective number of independent variants in the cis-window estimated with eigenMT, i.e., based on the eigenvalue decomposition of the regularized genotype correlation matrix ([Davis et al., AJHG, 2016](https://www.cell.com/ajhg/fulltext/S0002-9297(15)00492-9)). `pval_emt = pval_gi * tests_emt` is the eigenMT-adjusted p-value of the interaction term, and `pval_adj_bh` is the Benjamini-Hochberg adjusted p-value corresponding to `pval_emt`.
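The top associations can then be loaded with `pandas`, e.g. (a sketch using the default `output_dir`):
```
top_df = pd.read_csv(f'{prefix}.cis_qtl_top_assoc.txt.gz', sep='\t')
```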
164 | 165 | #### *trans*-QTL mapping 166 | This mode computes nominal associations between all phenotypes and genotypes. tensorQTL generates sparse output by default (associations with p-value < 1e-5). *cis*-associations are filtered out. The output is in parquet format, with the columns variant_id, phenotype_id, pval, b, b_se, and af (see [docs/outputs.md](docs/outputs.md)). 167 | In Python: 168 | ``` 169 | trans_df = trans.map_trans(genotype_df, phenotype_df, covariates_df, 170 | return_sparse=True, pval_threshold=1e-5, maf_threshold=0.05, 171 | batch_size=20000) 172 | # remove cis-associations 173 | trans_df = trans.filter_cis(trans_df, phenotype_pos_df, variant_df, window=5000000) 174 | ``` 175 | Shell command: 176 | ``` 177 | python3 -m tensorqtl ${plink_prefix_path} ${expression_bed} ${prefix} \ 178 | --covariates ${covariates_file} \ 179 | --mode trans 180 | ``` 181 | 182 | -------------------------------------------------------------------------------- /docs/outputs.md: -------------------------------------------------------------------------------- 1 | ### Output files 2 | #### Mode `cis_nominal` 3 | Column | Description 4 | --- | --- 5 | `phenotype_id` | Phenotype ID 6 | `variant_id` | Variant ID 7 | `start_distance` | Distance between the variant and phenotype start position (e.g., TSS) 8 | `end_distance` | Distance between the variant and phenotype end position (only present if different from start position) 9 | `af` | In-sample ALT allele frequency of the variant 10 | `ma_samples` | Number of samples carrying at least one minor allele 11 | `ma_count` | Number of minor alleles 12 | `pval_nominal` | Nominal p-value of the association between the phenotype and variant 13 | `slope` | Regression slope 14 | `slope_se` | Standard error of the regression slope 15 | 16 | #### Mode `cis_nominal`, with interaction term 17 | When an interaction term is included, the output contains the following columns in place of `pval_nominal`, `slope`, and `slope_se`: 18 | Column | Description 19 | --- | --- 20 | `pval_g` | Nominal p-value of the genotype term 21 | `b_g` | Slope of the genotype term 22 | `b_g_se` | Standard error of `b_g` 23 | `pval_i` | Nominal p-value of the interaction variable 24 | `b_i` | Slope of the interaction variable 25 | `b_i_se` | Standard error of `b_i` 26 | `pval_gi` | Nominal p-value of the interaction term 27 | `b_gi` | Slope of the interaction term 28 | `b_gi_se` | Standard error of `b_gi` 29 | `tests_emt` | Effective number of independent variants (Meff) estimated by eigenMT 30 | `pval_emt` | Bonferroni-adjusted `pval_gi` (i.e., multiplied by Meff) 31 | `pval_adj_bh` | Benjamini-Hochberg adjusted `pval_emt` 32 | 33 | #### Mode `cis` 34 | Column | Description 35 | --- | --- 36 | `phenotype_id` | Phenotype ID 37 | `num_var` | Number of variants in *cis*-window 38 | `beta_shape1` | Parameter of the fitted Beta distribution 39 | `beta_shape2` | Parameter of the fitted Beta distribution 40 | `true_df` | Degrees of freedom used to compute p-values 41 | `pval_true_df` | Nominal p-value based on `true_df` 42 | `variant_id` | Variant ID 43 | `start_distance` | Distance between the variant and phenotype start position (e.g., TSS) 44 | `end_distance` | Distance between the variant and phenotype end position (only present if different from start position) 45 | `ma_samples` | Number of samples carrying at least one minor allele 46 | `ma_count` | Number of minor alleles 47 | `af` | In-sample ALT allele frequency of the variant 48 | `pval_nominal` | Nominal p-value of the association between the phenotype and variant 49 | `slope` | Regression slope 50 | `slope_se` | Standard error of the regression slope 51 | `pval_perm` | Empirical p-value from permutations 52 | `pval_beta` | Beta-approximated empirical p-value 53 | `qval` | Storey q-value corresponding to `pval_beta` 54 | `pval_nominal_threshold` | Nominal p-value threshold for 
significant associations with the phenotype 55 | 56 | #### Mode `cis_independent` 57 | The columns are the same as for `cis`, excluding `qval` and `pval_nominal_threshold`, and adding: 58 | Column | Description 59 | --- | --- 60 | `rank` | Rank of the variant for the phenotype 61 | 62 | #### Mode `trans` 63 | Column | Description 64 | --- | --- 65 | `variant_id` | Variant ID 66 | `phenotype_id` | Phenotype ID 67 | `pval` | Nominal p-value of the association between the phenotype and variant 68 | `b` | Regression slope 69 | `b_se` | Standard error of the regression slope 70 | `r2` | Squared residual genotype-phenotype correlation (only generated if `map_trans(..., return_r2=True)`) 71 | `af` | In-sample ALT allele frequency of the variant 72 | -------------------------------------------------------------------------------- /example/data/GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18.pgen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/tensorqtl/812040e15f46797d5246a56339b2a699f1c596a6/example/data/GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18.pgen -------------------------------------------------------------------------------- /example/data/GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18.psam: -------------------------------------------------------------------------------- 1 | #IID SEX 2 | HG00096 1 3 | HG00097 2 4 | HG00099 2 5 | HG00100 2 6 | HG00101 1 7 | HG00102 2 8 | HG00103 1 9 | HG00105 1 10 | HG00106 2 11 | HG00108 1 12 | HG00109 1 13 | HG00110 2 14 | HG00111 2 15 | HG00112 1 16 | HG00114 1 17 | HG00115 1 18 | HG00116 1 19 | HG00117 1 20 | HG00118 2 21 | HG00119 1 22 | HG00120 2 23 | HG00121 2 24 | HG00122 2 25 | HG00123 2 26 | HG00125 2 27 | HG00126 1 28 | HG00127 2 29 | HG00128 2 30 | HG00129 1 31 | HG00130 2 32 | HG00131 1 33 | HG00132 2 34 | HG00133 2 35 | HG00136 1 36 | HG00137 2 37 | HG00138 1 38 | HG00139 1 39 | HG00141 1 40 | HG00142 1 41 | HG00143 1 42 | HG00145 1 43 | HG00146 2 44 | HG00148 1 45 | HG00149 1 46 | HG00150 2 47 | HG00151 1 48 | HG00154 2 49 | HG00155 1 50 | HG00157 1 51 | HG00158 2 52 | HG00159 1 53 | HG00160 1 54 | HG00171 2 55 | HG00173 2 56 | HG00174 2 57 | HG00176 2 58 | HG00177 2 59 | HG00178 2 60 | HG00179 2 61 | HG00180 2 62 | HG00181 1 63 | HG00182 1 64 | HG00183 1 65 | HG00185 1 66 | HG00186 1 67 | HG00187 1 68 | HG00188 1 69 | HG00189 1 70 | HG00231 2 71 | HG00232 2 72 | HG00233 2 73 | HG00234 1 74 | HG00235 2 75 | HG00236 2 76 | HG00238 2 77 | HG00239 2 78 | HG00240 2 79 | HG00242 1 80 | HG00243 1 81 | HG00244 1 82 | HG00245 2 83 | HG00246 1 84 | HG00250 2 85 | HG00251 1 86 | HG00252 1 87 | HG00253 2 88 | HG00255 2 89 | HG00256 1 90 | HG00257 2 91 | HG00258 2 92 | HG00259 2 93 | HG00260 1 94 | HG00261 2 95 | HG00262 2 96 | HG00263 2 97 | HG00264 1 98 | HG00265 1 99 | HG00266 2 100 | HG00267 1 101 | HG00268 2 102 | HG00269 2 103 | HG00271 1 104 | HG00272 2 105 | HG00273 1 106 | HG00274 2 107 | HG00275 2 108 | HG00276 2 109 | HG00277 1 110 | HG00278 1 111 | HG00280 1 112 | HG00281 2 113 | HG00282 2 114 | HG00284 1 115 | HG00285 2 116 | HG00306 2 117 | HG00308 1 118 | HG00309 2 119 | HG00310 1 120 | HG00311 1 121 | HG00313 2 122 | HG00315 2 123 | HG00319 2 124 | HG00320 2 125 | HG00321 1 126 | HG00323 2 127 | HG00324 2 128 | HG00325 1 129 | HG00326 2 130 | HG00327 2 131 | HG00328 2 132 | HG00329 1 133 | HG00330 2 134 | HG00331 2 135 | HG00332 2 136 | HG00334 2 137 | HG00335 1 138 | HG00336 1 139 | HG00337 2 140 | 
HG00338 1 141 | HG00339 2 142 | HG00341 1 143 | HG00342 1 144 | HG00343 2 145 | HG00344 2 146 | HG00345 1 147 | HG00346 2 148 | HG00349 2 149 | HG00350 2 150 | HG00351 1 151 | HG00353 2 152 | HG00355 2 153 | HG00356 2 154 | HG00358 1 155 | HG00360 1 156 | HG00361 2 157 | HG00362 2 158 | HG00364 2 159 | HG00365 2 160 | HG00366 1 161 | HG00367 2 162 | HG00369 1 163 | HG00371 1 164 | HG00372 1 165 | HG00373 2 166 | HG00375 1 167 | HG00376 2 168 | HG00378 2 169 | HG00379 2 170 | HG00380 2 171 | HG00381 2 172 | HG00382 1 173 | HG00383 2 174 | HG00384 2 175 | HG01334 1 176 | HG01789 1 177 | HG01790 2 178 | HG01791 1 179 | HG02215 2 180 | NA06984 1 181 | NA06985 2 182 | NA06986 1 183 | NA06989 2 184 | NA06994 1 185 | NA07037 2 186 | NA07048 1 187 | NA07051 1 188 | NA07056 2 189 | NA07347 1 190 | NA07357 1 191 | NA10847 2 192 | NA10851 1 193 | NA11829 1 194 | NA11830 2 195 | NA11831 1 196 | NA11832 2 197 | NA11840 2 198 | NA11843 1 199 | NA11881 1 200 | NA11892 2 201 | NA11893 1 202 | NA11894 2 203 | NA11918 2 204 | NA11920 2 205 | NA11930 1 206 | NA11931 2 207 | NA11992 1 208 | NA11994 1 209 | NA11995 2 210 | NA12004 2 211 | NA12005 1 212 | NA12006 2 213 | NA12043 1 214 | NA12044 2 215 | NA12045 1 216 | NA12058 2 217 | NA12144 1 218 | NA12154 1 219 | NA12155 1 220 | NA12156 2 221 | NA12234 2 222 | NA12249 2 223 | NA12272 1 224 | NA12273 2 225 | NA12275 2 226 | NA12282 1 227 | NA12283 2 228 | NA12286 1 229 | NA12287 2 230 | NA12340 1 231 | NA12341 2 232 | NA12342 1 233 | NA12347 1 234 | NA12348 2 235 | NA12383 2 236 | NA12399 1 237 | NA12400 2 238 | NA12413 1 239 | NA12489 2 240 | NA12546 1 241 | NA12716 1 242 | NA12717 2 243 | NA12718 2 244 | NA12749 2 245 | NA12750 1 246 | NA12751 2 247 | NA12760 1 248 | NA12761 2 249 | NA12762 1 250 | NA12763 2 251 | NA12775 1 252 | NA12776 2 253 | NA12777 1 254 | NA12778 2 255 | NA12812 1 256 | NA12813 2 257 | NA12814 1 258 | NA12815 2 259 | NA12827 1 260 | NA12829 1 261 | NA12830 2 262 | NA12842 1 263 | NA12843 2 264 | NA12872 1 265 | NA12873 2 266 | NA12874 1 267 | NA12889 1 268 | NA12890 2 269 | NA18486 1 270 | NA18488 2 271 | NA18489 2 272 | NA18498 1 273 | NA18499 2 274 | NA18502 2 275 | NA18505 2 276 | NA18508 2 277 | NA18510 1 278 | NA18511 2 279 | NA18517 2 280 | NA18519 1 281 | NA18520 2 282 | NA18858 2 283 | NA18861 2 284 | NA18867 2 285 | NA18868 1 286 | NA18870 2 287 | NA18873 2 288 | NA18907 2 289 | NA18908 1 290 | NA18909 2 291 | NA18910 1 292 | NA18912 2 293 | NA18916 2 294 | NA18917 1 295 | NA18923 1 296 | NA18933 2 297 | NA18934 1 298 | NA19092 1 299 | NA19093 2 300 | NA19095 2 301 | NA19096 1 302 | NA19098 1 303 | NA19099 2 304 | NA19102 2 305 | NA19107 1 306 | NA19108 2 307 | NA19113 1 308 | NA19114 2 309 | NA19116 2 310 | NA19117 1 311 | NA19118 2 312 | NA19119 1 313 | NA19121 1 314 | NA19129 2 315 | NA19130 1 316 | NA19131 2 317 | NA19137 2 318 | NA19138 1 319 | NA19141 1 320 | NA19143 2 321 | NA19144 1 322 | NA19146 1 323 | NA19147 2 324 | NA19149 2 325 | NA19152 2 326 | NA19153 1 327 | NA19159 2 328 | NA19160 1 329 | NA19171 1 330 | NA19172 2 331 | NA19175 1 332 | NA19184 1 333 | NA19185 2 334 | NA19189 1 335 | NA19190 2 336 | NA19197 2 337 | NA19198 1 338 | NA19200 1 339 | NA19201 2 340 | NA19204 2 341 | NA19206 2 342 | NA19207 1 343 | NA19209 2 344 | NA19210 1 345 | NA19213 1 346 | NA19214 2 347 | NA19222 2 348 | NA19223 1 349 | NA19225 2 350 | NA19235 2 351 | NA19236 1 352 | NA19247 2 353 | NA19248 1 354 | NA19256 1 355 | NA19257 2 356 | NA20502 2 357 | NA20503 2 358 | NA20504 2 359 | NA20505 2 360 | NA20506 2 361 | NA20507 2 362 | 
NA20508 2 363 | NA20509 1 364 | NA20510 1 365 | NA20512 1 366 | NA20513 1 367 | NA20514 2 368 | NA20515 1 369 | NA20516 1 370 | NA20517 2 371 | NA20518 1 372 | NA20519 1 373 | NA20520 1 374 | NA20521 1 375 | NA20524 1 376 | NA20525 1 377 | NA20527 1 378 | NA20528 1 379 | NA20529 2 380 | NA20530 2 381 | NA20531 2 382 | NA20532 1 383 | NA20534 1 384 | NA20535 2 385 | NA20536 1 386 | NA20538 1 387 | NA20539 1 388 | NA20540 2 389 | NA20541 2 390 | NA20542 2 391 | NA20543 1 392 | NA20544 1 393 | NA20581 1 394 | NA20582 2 395 | NA20585 2 396 | NA20586 1 397 | NA20588 1 398 | NA20589 2 399 | NA20752 1 400 | NA20754 1 401 | NA20756 2 402 | NA20757 2 403 | NA20758 1 404 | NA20759 1 405 | NA20760 2 406 | NA20761 2 407 | NA20765 1 408 | NA20766 2 409 | NA20768 2 410 | NA20769 2 411 | NA20770 1 412 | NA20771 2 413 | NA20772 2 414 | NA20773 2 415 | NA20774 2 416 | NA20778 1 417 | NA20783 1 418 | NA20785 1 419 | NA20786 2 420 | NA20787 1 421 | NA20790 2 422 | NA20792 1 423 | NA20795 2 424 | NA20796 1 425 | NA20797 2 426 | NA20798 1 427 | NA20799 2 428 | NA20800 2 429 | NA20801 1 430 | NA20802 2 431 | NA20803 1 432 | NA20804 2 433 | NA20805 1 434 | NA20806 1 435 | NA20807 2 436 | NA20808 2 437 | NA20809 1 438 | NA20810 1 439 | NA20811 1 440 | NA20812 1 441 | NA20813 2 442 | NA20814 1 443 | NA20815 1 444 | NA20819 2 445 | NA20826 2 446 | NA20828 2 447 | -------------------------------------------------------------------------------- /example/data/GEUVADIS.445_samples.expression.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/tensorqtl/812040e15f46797d5246a56339b2a699f1c596a6/example/data/GEUVADIS.445_samples.expression.bed.gz -------------------------------------------------------------------------------- /example/tensorqtl_examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### *cis*- and *trans*-QTL mapping with tensorQTL\n", 8 | "\n", 9 | "This notebook provides examples for running *cis*- and *trans*-QTL mapping with tensorQTL, using open-access data from the [GEUVADIS](https://www.ebi.ac.uk/arrayexpress/experiments/E-GEUV-1/) project.\n", 10 | "\n", 11 | "#### Requirements\n", 12 | "An environment configured with a GPU and ~50GB of memory.\n", 13 | "\n", 14 | "#### Test dataset\n", 15 | "\n", 16 | "*Note: these files are provided for testing/benchmarking purposes only. They do not constitute an official release from the GEUVADIS project, and no quality-control was applied.*\n", 17 | "\n", 18 | "Genotypes in PLINK2 format (chr18 only), and normalized expression data are available [in this repository](./data/); the full dataset is available at [gs://gtex-resources/test_data/geuvadis](https://console.cloud.google.com/storage/browser/gtex-resources/test_data/geuvadis) ([requester pays](https://cloud.google.com/storage/docs/requester-pays))." 
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "torch: 2.5.1+cu124 (CUDA 12.4), device: cuda\n", 31 | "pandas: 2.2.3\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "import pandas as pd\n", 37 | "import torch\n", 38 | "import tensorqtl\n", 39 | "from tensorqtl import pgen, cis, trans, post\n", 40 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 41 | "print(f\"torch: {torch.__version__} (CUDA {torch.version.cuda}), device: {device}\")\n", 42 | "print(f\"pandas: {pd.__version__}\")\n", 43 | "\n", 44 | "# define paths to data\n", 45 | "plink_prefix_path = 'data/GEUVADIS.445_samples.GRCh38.20170504.maf01.filtered.nodup.chr18'\n", 46 | "expression_bed = 'data/GEUVADIS.445_samples.expression.bed.gz'\n", 47 | "covariates_file = 'data/GEUVADIS.445_samples.covariates.txt'\n", 48 | "prefix = 'GEUVADIS.445_samples'\n", 49 | "\n", 50 | "# load phenotypes and covariates\n", 51 | "phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expression_bed)\n", 52 | "covariates_df = pd.read_csv(covariates_file, sep='\\t', index_col=0).T\n", 53 | "\n", 54 | "# PLINK reader for genotypes\n", 55 | "pgr = pgen.PgenReader(plink_prefix_path)\n", 56 | "genotype_df = pgr.load_genotypes()\n", 57 | "variant_df = pgr.variant_df" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "### *cis*-QTL: nominal p-values for all variant-phenotype pairs" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 2, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "cis-QTL mapping: nominal associations for all variant-phenotype pairs\n", 77 | " * 445 samples\n", 78 | " * 301 phenotypes\n", 79 | " * 26 covariates\n", 80 | " * 367759 variants\n", 81 | " * cis-window: ±1,000,000\n", 82 | " * checking phenotypes: 301/301\n", 83 | " * Computing associations\n", 84 | " Mapping chromosome chr18\n", 85 | " processing phenotype 301/301\n", 86 | " time elapsed: 0.04 min\n", 87 | " * writing output\n", 88 | "done.\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "# map all cis-associations (results for each chromosome are written to file)\n", 94 | "\n", 95 | "# all genes\n", 96 | "# cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, prefix, covariates_df=covariates_df)\n", 97 | "\n", 98 | "# genes on chr18\n", 99 | "cis.map_nominal(genotype_df, variant_df,\n", 100 | " phenotype_df.loc[phenotype_pos_df['chr'] == 'chr18'],\n", 101 | " phenotype_pos_df.loc[phenotype_pos_df['chr'] == 'chr18'],\n", 102 | " prefix, covariates_df=covariates_df)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 3, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/html": [ 113 | "
\n", 114 | "\n", 127 | "\n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | "
phenotype_idvariant_idstart_distanceafma_samplesma_countpval_nominalslopeslope_se
0ENSG00000263006.6chr18_10644_C_G_b38-984210.01685415150.580873-0.1177610.213125
1ENSG00000263006.6chr18_10847_C_A_b38-982180.01910117170.142884-0.2987260.203505
2ENSG00000263006.6chr18_11275_G_A_b38-977900.02471922220.7452310.0546190.167981
3ENSG00000263006.6chr18_11358_G_A_b38-977070.02471922220.7452310.0546190.167981
4ENSG00000263006.6chr18_11445_G_A_b38-976200.02359621210.6032760.0893780.171851
\n", 205 | "
" 206 | ], 207 | "text/plain": [ 208 | " phenotype_id variant_id start_distance af \\\n", 209 | "0 ENSG00000263006.6 chr18_10644_C_G_b38 -98421 0.016854 \n", 210 | "1 ENSG00000263006.6 chr18_10847_C_A_b38 -98218 0.019101 \n", 211 | "2 ENSG00000263006.6 chr18_11275_G_A_b38 -97790 0.024719 \n", 212 | "3 ENSG00000263006.6 chr18_11358_G_A_b38 -97707 0.024719 \n", 213 | "4 ENSG00000263006.6 chr18_11445_G_A_b38 -97620 0.023596 \n", 214 | "\n", 215 | " ma_samples ma_count pval_nominal slope slope_se \n", 216 | "0 15 15 0.580873 -0.117761 0.213125 \n", 217 | "1 17 17 0.142884 -0.298726 0.203505 \n", 218 | "2 22 22 0.745231 0.054619 0.167981 \n", 219 | "3 22 22 0.745231 0.054619 0.167981 \n", 220 | "4 21 21 0.603276 0.089378 0.171851 " 221 | ] 222 | }, 223 | "execution_count": 3, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "# load results\n", 230 | "pairs_df = pd.read_parquet(f'{prefix}.cis_qtl_pairs.chr18.parquet')\n", 231 | "pairs_df.head()" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "### *cis*-QTL: empirical p-values for phenotypes" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 4, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "name": "stdout", 248 | "output_type": "stream", 249 | "text": [ 250 | "cis-QTL mapping: empirical p-values for phenotypes\n", 251 | " * 445 samples\n", 252 | " * 301 phenotypes\n", 253 | " * 26 covariates\n", 254 | " * 367759 variants\n", 255 | " * cis-window: ±1,000,000\n", 256 | " * using seed 123456\n", 257 | " * checking phenotypes: 301/301\n", 258 | " * computing permutations\n", 259 | " processing phenotype 301/301\n", 260 | " Time elapsed: 0.31 min\n", 261 | "done.\n", 262 | "Computing q-values\n", 263 | " * Number of phenotypes tested: 301\n", 264 | " * Correlation between Beta-approximated and empirical p-values: 1.0000\n", 265 | " * Calculating q-values with lambda = 0.850\n", 266 | " * Proportion of significant phenotypes (1-pi0): 0.76\n", 267 | " * QTL phenotypes @ FDR 0.05: 205\n", 268 | " * min p-value threshold @ FDR 0.05: 0.135284\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "# all genes\n", 274 | "# cis_df = cis.map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df=covariates_df)\n", 275 | "\n", 276 | "# genes on chr18\n", 277 | "cis_df = cis.map_cis(genotype_df, variant_df, \n", 278 | " phenotype_df.loc[phenotype_pos_df['chr'] == 'chr18'],\n", 279 | " phenotype_pos_df.loc[phenotype_pos_df['chr'] == 'chr18'],\n", 280 | " covariates_df=covariates_df, seed=123456)\n", 281 | "# compute q-values (in practice, this must be run on all genes, not a subset)\n", 282 | "post.calculate_qvalues(cis_df, fdr=0.05, qvalue_lambda=0.85)" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 5, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/html": [ 293 | "
\n", 294 | "\n", 307 | "\n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | "
num_varbeta_shape1beta_shape2true_dfpval_true_dfvariant_idstart_distanceend_distancema_samplesma_countafpval_nominalslopeslope_sepval_permpval_betaqvalpval_nominal_threshold
phenotype_id
ENSG00000263006.661201.0388111138.434082374.6604008.220950e-40chr18_112535_G_A_b38347034702122510.2820224.050344e-440.7264250.0461710.0001003.677735e-382.697006e-370.000141
ENSG00000101557.1463551.0322371076.303223370.1764225.632806e-11chr18_210698_T_C_b3852315523151922220.2494383.505411e-12-0.1917120.0267490.0001003.498951e-083.563747e-080.000146
ENSG00000079134.1169211.0472191155.660156370.3560493.888738e-08chr18_243547_T_A_b38-24503-245032933830.4303375.473709e-09-0.1227200.0206020.0001002.743975e-051.916427e-050.000141
ENSG00000263884.169211.0398061152.501587369.8735057.681884e-04chr18_584440_G_C_b3831629231629281880.0988763.540399e-04-0.3308110.0918450.5748435.695498e-011.577698e-010.000139
ENSG00000158270.1181341.0549191277.927246369.4690862.516529e-09chr18_519222_C_T_b3818500185001081150.1292132.409717e-10-0.3882770.0598080.0001001.567348e-061.321136e-060.000130
\n", 460 | "
" 461 | ], 462 | "text/plain": [ 463 | " num_var beta_shape1 beta_shape2 true_df \\\n", 464 | "phenotype_id \n", 465 | "ENSG00000263006.6 6120 1.038811 1138.434082 374.660400 \n", 466 | "ENSG00000101557.14 6355 1.032237 1076.303223 370.176422 \n", 467 | "ENSG00000079134.11 6921 1.047219 1155.660156 370.356049 \n", 468 | "ENSG00000263884.1 6921 1.039806 1152.501587 369.873505 \n", 469 | "ENSG00000158270.11 8134 1.054919 1277.927246 369.469086 \n", 470 | "\n", 471 | " pval_true_df variant_id start_distance \\\n", 472 | "phenotype_id \n", 473 | "ENSG00000263006.6 8.220950e-40 chr18_112535_G_A_b38 3470 \n", 474 | "ENSG00000101557.14 5.632806e-11 chr18_210698_T_C_b38 52315 \n", 475 | "ENSG00000079134.11 3.888738e-08 chr18_243547_T_A_b38 -24503 \n", 476 | "ENSG00000263884.1 7.681884e-04 chr18_584440_G_C_b38 316292 \n", 477 | "ENSG00000158270.11 2.516529e-09 chr18_519222_C_T_b38 18500 \n", 478 | "\n", 479 | " end_distance ma_samples ma_count af \\\n", 480 | "phenotype_id \n", 481 | "ENSG00000263006.6 3470 212 251 0.282022 \n", 482 | "ENSG00000101557.14 52315 192 222 0.249438 \n", 483 | "ENSG00000079134.11 -24503 293 383 0.430337 \n", 484 | "ENSG00000263884.1 316292 81 88 0.098876 \n", 485 | "ENSG00000158270.11 18500 108 115 0.129213 \n", 486 | "\n", 487 | " pval_nominal slope slope_se pval_perm pval_beta \\\n", 488 | "phenotype_id \n", 489 | "ENSG00000263006.6 4.050344e-44 0.726425 0.046171 0.000100 3.677735e-38 \n", 490 | "ENSG00000101557.14 3.505411e-12 -0.191712 0.026749 0.000100 3.498951e-08 \n", 491 | "ENSG00000079134.11 5.473709e-09 -0.122720 0.020602 0.000100 2.743975e-05 \n", 492 | "ENSG00000263884.1 3.540399e-04 -0.330811 0.091845 0.574843 5.695498e-01 \n", 493 | "ENSG00000158270.11 2.409717e-10 -0.388277 0.059808 0.000100 1.567348e-06 \n", 494 | "\n", 495 | " qval pval_nominal_threshold \n", 496 | "phenotype_id \n", 497 | "ENSG00000263006.6 2.697006e-37 0.000141 \n", 498 | "ENSG00000101557.14 3.563747e-08 0.000146 \n", 499 | "ENSG00000079134.11 1.916427e-05 0.000141 \n", 500 | "ENSG00000263884.1 1.577698e-01 0.000139 \n", 501 | "ENSG00000158270.11 1.321136e-06 0.000130 " 502 | ] 503 | }, 504 | "execution_count": 5, 505 | "metadata": {}, 506 | "output_type": "execute_result" 507 | } 508 | ], 509 | "source": [ 510 | "cis_df.head()" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "### *trans*-QTL mapping" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 6, 523 | "metadata": {}, 524 | "outputs": [ 525 | { 526 | "name": "stdout", 527 | "output_type": "stream", 528 | "text": [ 529 | "trans-QTL mapping\n", 530 | " * 445 samples\n", 531 | " * 19836 phenotypes\n", 532 | " * 26 covariates\n", 533 | " * 367759 variants\n", 534 | " processing batch 37/37\n", 535 | " elapsed time: 0.02 min\n", 536 | " * 210838 variants passed MAF >= 0.05 filtering\n", 537 | "done.\n" 538 | ] 539 | } 540 | ], 541 | "source": [ 542 | "# run mapping\n", 543 | "# to limit output size, only associations with p-value <= 1e-5 are returned\n", 544 | "trans_df = trans.map_trans(genotype_df, phenotype_df, covariates_df, batch_size=10000,\n", 545 | " return_sparse=True, pval_threshold=1e-5, maf_threshold=0.05)" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": 7, 551 | "metadata": {}, 552 | "outputs": [], 553 | "source": [ 554 | "# remove cis-associations\n", 555 | "trans_df = trans.filter_cis(trans_df, phenotype_pos_df, variant_df, window=5000000)" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | 
"execution_count": 8, 561 | "metadata": {}, 562 | "outputs": [ 563 | { 564 | "data": { 565 | "text/html": [ 566 | "
\n", 567 | "\n", 580 | "\n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | "
variant_idphenotype_idpvalbb_seaf
1chr18_20683_A_G_b38ENSG00000163900.105.012229e-060.2095400.0453090.179775
3chr18_27346_G_T_b38ENSG00000164088.177.309937e-06-0.2656230.0584830.123596
11chr18_43564_G_A_b38ENSG00000198162.121.314060e-07-0.2029220.0377920.093258
12chr18_43564_G_A_b38ENSG00000261098.18.494569e-06-0.4219680.0935940.093258
13chr18_43611_C_T_b38ENSG00000265972.51.448981e-06-0.2723010.0556970.135955
\n", 640 | "
" 641 | ], 642 | "text/plain": [ 643 | " variant_id phenotype_id pval b b_se \\\n", 644 | "1 chr18_20683_A_G_b38 ENSG00000163900.10 5.012229e-06 0.209540 0.045309 \n", 645 | "3 chr18_27346_G_T_b38 ENSG00000164088.17 7.309937e-06 -0.265623 0.058483 \n", 646 | "11 chr18_43564_G_A_b38 ENSG00000198162.12 1.314060e-07 -0.202922 0.037792 \n", 647 | "12 chr18_43564_G_A_b38 ENSG00000261098.1 8.494569e-06 -0.421968 0.093594 \n", 648 | "13 chr18_43611_C_T_b38 ENSG00000265972.5 1.448981e-06 -0.272301 0.055697 \n", 649 | "\n", 650 | " af \n", 651 | "1 0.179775 \n", 652 | "3 0.123596 \n", 653 | "11 0.093258 \n", 654 | "12 0.093258 \n", 655 | "13 0.135955 " 656 | ] 657 | }, 658 | "execution_count": 8, 659 | "metadata": {}, 660 | "output_type": "execute_result" 661 | } 662 | ], 663 | "source": [ 664 | "trans_df.head()" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": null, 670 | "metadata": {}, 671 | "outputs": [], 672 | "source": [] 673 | } 674 | ], 675 | "metadata": { 676 | "kernelspec": { 677 | "display_name": "Python 3 (ipykernel)", 678 | "language": "python", 679 | "name": "python3" 680 | }, 681 | "language_info": { 682 | "codemirror_mode": { 683 | "name": "ipython", 684 | "version": 3 685 | }, 686 | "file_extension": ".py", 687 | "mimetype": "text/x-python", 688 | "name": "python", 689 | "nbconvert_exporter": "python", 690 | "pygments_lexer": "ipython3", 691 | "version": "3.11.9" 692 | } 693 | }, 694 | "nbformat": 4, 695 | "nbformat_minor": 4 696 | } 697 | -------------------------------------------------------------------------------- /install/INSTALL.md: -------------------------------------------------------------------------------- 1 | ### Setup CUDA drivers and PyTorch on GCP 2 | 3 | Launch a new instance configured with Ubuntu 22.04 LTS and a GPU, clone this repository, and run the following: 4 | #### Install CUDA 5 | ```bash 6 | sudo ./install_cuda.sh 7 | sudo reboot 8 | # verify 9 | nvidia-smi 10 | ``` 11 | 12 | #### Install R 13 | Required for computing q-values. Follow instructions [here](https://www.digitalocean.com/community/tutorials/how-to-install-r-on-ubuntu-22-04), then install the 'qvalue' package with 14 | ```bash 15 | if (!require("BiocManager", quietly = TRUE)) 16 | install.packages("BiocManager") 17 | BiocManager::install("qvalue") 18 | ``` 19 | 20 | #### Install Python 3 21 | Using a [conda](https://github.com/conda-forge/miniforge) environment is recommended. The `tensorqtl_env.yml` configuration contains all required packages, including `torch` and `tensorqtl`. 
22 | ```bash 23 | mamba env create -f tensorqtl_env.yml 24 | conda activate tensorqtl 25 | 26 | # verify 27 | python -c "import torch; print(torch.__version__); print('CUDA available: {} ({})'.format(torch.cuda.is_available(), torch.cuda.get_device_name(torch.cuda.current_device())))" 28 | 29 | # this should print something like 30 | # 2.1.2+cu121 31 | # CUDA available: True (Tesla P100-PCIE-16GB) 32 | ``` 33 | 34 | #### Install rmate (optional) 35 | ```bash 36 | sudo apt install -y ruby 37 | mkdir ~/bin 38 | curl -Lo ~/bin/rmate https://raw.githubusercontent.com/textmate/rmate/master/bin/rmate 39 | chmod a+x ~/bin/rmate 40 | echo 'export RMATE_PORT=${rmate_port}' >> ~/.bashrc 41 | ``` 42 | -------------------------------------------------------------------------------- /install/install_cuda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # install script for PyTorch 2.1.2 + CUDA 12.1 on Ubuntu 22.04 3 | # for torch, see https://pytorch.org/get-started/locally/ 4 | # for CUDA drivers, see https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_local 5 | # for other versions, see https://developer.nvidia.com/cuda-toolkit-archive 6 | 7 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin 8 | sudo mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600 9 | wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda-repo-ubuntu2204-12-1-local_12.1.0-530.30.02-1_amd64.deb 10 | sudo dpkg -i cuda-repo-ubuntu2204-12-1-local_12.1.0-530.30.02-1_amd64.deb 11 | sudo cp /var/cuda-repo-ubuntu2204-12-1-local/cuda-*-keyring.gpg /usr/share/keyrings/ 12 | sudo apt-get update 13 | sudo apt-get -y install cuda 14 | rm cuda-repo-ubuntu2204-12-1-local_12.1.0-530.30.02-1_amd64.deb 15 | 16 | # test 17 | python -c "import torch; print('CUDA available: {} ({})'.format(torch.cuda.is_available(), torch.cuda.get_device_name(torch.cuda.current_device())))" 18 | -------------------------------------------------------------------------------- /install/tensorqtl_env.yml: -------------------------------------------------------------------------------- 1 | name: tensorqtl 2 | dependencies: 3 | - python=3.11 4 | - pip 5 | - pip: 6 | - numpy 7 | - pandas 8 | - pandas-plink 9 | - Pgenlib>=0.90.1 10 | - pyarrow 11 | - qtl 12 | - rpy2 13 | - scipy 14 | - torch 15 | - tensorqtl 16 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.setuptools] 6 | packages = ["tensorqtl"] 7 | 8 | [project] 9 | name = "tensorqtl" 10 | version = "1.0.10" 11 | dependencies = [ 12 | "numpy", 13 | "pandas", 14 | "Pgenlib>=0.90.1", 15 | "qtl", 16 | "scipy", 17 | "torch", 18 | ] 19 | authors = [ 20 | {name = "Francois Aguet", email = "francois@broadinstitute.org"} 21 | ] 22 | maintainers = [ 23 | {name = "Francois Aguet", email = "francois@broadinstitute.org"} 24 | ] 25 | description = "GPU-accelerated QTL mapper" 26 | readme = "README.md" 27 | license = {file = "LICENSE"} 28 | keywords = ["Quantitative trait loci"] 29 | classifiers = [ 30 | "Development Status :: 4 - Beta", 31 | "Programming Language :: Python :: 3", 32 | "Intended Audience :: Science/Research", 33 | "Topic :: 
Scientific/Engineering :: Bio-Informatics", 34 | ] 35 | 36 | [project.urls] 37 | Repository = "https://github.com/broadinstitute/tensorqtl.git" 38 | -------------------------------------------------------------------------------- /tensorqtl/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | from .tensorqtl import * 3 | 4 | __version__ = importlib.metadata.version(__name__) 5 | -------------------------------------------------------------------------------- /tensorqtl/__main__.py: -------------------------------------------------------------------------------- 1 | import tensorqtl 2 | tensorqtl.main() 3 | -------------------------------------------------------------------------------- /tensorqtl/coloc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy.stats as stats 4 | import torch 5 | import os 6 | import time 7 | import sys 8 | sys.path.insert(1, os.path.dirname(__file__)) 9 | import genotypeio, eigenmt 10 | from core import * 11 | 12 | 13 | def logsumexp(x, dim=0): 14 | mmax,_ = torch.max(x, dim=dim, keepdim=True) 15 | return mmax + (x-mmax).exp().sum(dim, keepdim=True).log() 16 | 17 | 18 | def logdiff(x, y, dim=0): 19 | xmax,_ = torch.max(x, dim=dim, keepdim=True) 20 | ymax,_ = torch.max(y, dim=dim, keepdim=True) 21 | mmax = torch.max(xmax, ymax) 22 | return mmax + ((x - mmax).exp() - (y - mmax).exp()).log() 23 | 24 | 25 | def coloc(genotypes1_t, genotypes2_t, phenotype1_t, phenotype2_t, 26 | residualizer1=None, residualizer2=None, mode='beta', 27 | p1=1e-4, p2=1e-4, p12=1e-5): 28 | """COLOC from summary statistics (either beta/sds or p-values and MAF)""" 29 | 30 | assert phenotype1_t.dim() == 1 31 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 32 | 33 | # phenotype 1 34 | if mode == 'beta': 35 | r_nominal_t, genotype_var_t, phenotype_var_t = calculate_corr( 36 | genotypes1_t, phenotype1_t.reshape(1,-1), residualizer1, return_var=True) 37 | r_nominal_t = r_nominal_t.squeeze() 38 | var_ratio_t = phenotype_var_t.reshape(1,-1) / genotype_var_t.reshape(-1,1) 39 | else: 40 | r_nominal_t = calculate_corr( 41 | genotypes1_t, phenotype1_t.reshape(1,-1), residualizer1, return_var=False).squeeze() 42 | r2_nominal_t = r_nominal_t.double().pow(2) 43 | 44 | if residualizer1 is not None: 45 | dof = residualizer1.dof 46 | else: 47 | dof = phenotype1_t.shape[0] - 2 48 | 49 | if mode == 'beta': 50 | tstat2_t = r2_nominal_t * dof / (1 - r2_nominal_t) 51 | beta2_t = r2_nominal_t * var_ratio_t.squeeze() 52 | beta_var_t = beta2_t / tstat2_t 53 | var_prior = 0.0225 * phenotype_var_t 54 | r = var_prior / (var_prior + beta_var_t) 55 | l1 = 0.5 * ((1 - r).log() + r*tstat2_t) 56 | else: 57 | # compute p-values and z-score to match COLOC results exactly 58 | # (instead of directly using t-statistic) 59 | tstat_t = r_nominal_t * torch.sqrt(dof / (1 - r2_nominal_t)) 60 | p = stats.t.cdf(-np.abs(tstat_t.cpu().numpy()), dof) # 2 dropped since canceled in isf 61 | maf_t = calculate_maf(genotypes1_t) 62 | N = phenotype1_t.shape[0] 63 | v = 1 / (2 * N * maf_t * (1 - maf_t)) 64 | z2_t = torch.Tensor(stats.norm.isf(p)**2).to(device) 65 | r = 0.0225 / (0.0225 + v) 66 | l1 = 0.5 * ((1 - r).log() + r*z2_t) 67 | 68 | # phenotype 2 69 | if phenotype2_t.dim() == 1: 70 | num_phenotypes = 1 71 | num_samples = phenotype2_t.shape[0] 72 | phenotype2_t = phenotype2_t.reshape(1,-1) 73 | else: 74 | num_phenotypes, num_samples = 
phenotype2_t.shape 75 | 76 | if mode == 'beta': 77 | r_nominal_t, genotype_var_t, phenotype_var_t = calculate_corr( 78 | genotypes2_t, phenotype2_t, residualizer2, return_var=True) 79 | r_nominal_t = r_nominal_t.squeeze() 80 | var_ratio_t = phenotype_var_t.reshape(1,-1) / genotype_var_t.reshape(-1,1) 81 | else: 82 | r_nominal_t = calculate_corr(genotypes2_t, phenotype2_t, residualizer2, return_var=False).squeeze() 83 | r2_nominal_t = r_nominal_t.double().pow(2) 84 | 85 | if residualizer2 is not None: 86 | dof = residualizer2.dof 87 | else: 88 | dof = num_samples - 2 89 | 90 | if mode == 'beta': 91 | tstat2_t = r2_nominal_t * dof / (1 - r2_nominal_t) 92 | beta2_t = r2_nominal_t * var_ratio_t.squeeze() 93 | beta_var_t = beta2_t / tstat2_t 94 | var_prior = 0.0225 * phenotype_var_t 95 | r = var_prior / (var_prior + beta_var_t) 96 | l2 = 0.5 * ((1 - r).log() + r*tstat2_t) 97 | else: 98 | tstat_t = r_nominal_t * torch.sqrt(dof / (1 - r2_nominal_t)) 99 | p = stats.t.cdf(-np.abs(tstat_t.cpu().numpy()), dof) 100 | maf_t = calculate_maf(genotypes2_t) 101 | v = 1 / (2 * num_samples * maf_t * (1 - maf_t)) 102 | z2_t = torch.Tensor(stats.norm.isf(p)**2).to(device) 103 | r = 0.0225 / (0.0225 + v) 104 | if num_phenotypes > 1: 105 | r = r.reshape(-1,1) 106 | l2 = 0.5 * ((1 - r).log() + r*z2_t) 107 | 108 | if num_phenotypes > 1: 109 | lsum = l1.reshape(-1,1) + l2 110 | lh0_abf = torch.zeros([1, num_phenotypes]).to(device) 111 | lh1_abf = np.log(p1) + logsumexp(l1).repeat([1, num_phenotypes]) 112 | else: 113 | lsum = l1 + l2 114 | lh0_abf = torch.zeros([1]).to(device) 115 | lh1_abf = np.log(p1) + logsumexp(l1) 116 | lh2_abf = np.log(p2) + logsumexp(l2) 117 | lh3_abf = np.log(p1) + np.log(p2) + logdiff(logsumexp(l1) + logsumexp(l2), logsumexp(lsum)) 118 | lh4_abf = np.log(p12) + logsumexp(lsum) 119 | all_abf = torch.cat([lh0_abf, lh1_abf, lh2_abf, lh3_abf, lh4_abf]) 120 | return (all_abf - logsumexp(all_abf, dim=0)).exp().squeeze() 121 | 122 | 123 | def run_pairs(genotype_df, variant_df, phenotype1_df, phenotype2_df, phenotype_pos_df, 124 | covariates1_df=None, covariates2_df=None, p1=1e-4, p2=1e-4, p12=1e-5, mode='beta', 125 | maf_threshold=0, window=1000000, batch_size=10000, logger=None, verbose=True): 126 | """Compute COLOC for all phenotype pairs""" 127 | 128 | assert np.all(phenotype1_df.index == phenotype2_df.index) 129 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 130 | 131 | if logger is None: 132 | logger = SimpleLogger() 133 | 134 | logger.write('Computing COLOC for all pairs of phenotypes') 135 | logger.write(f' * {phenotype1_df.shape[0]} phenotypes') 136 | logger.write(f' * phenotype group 1: {phenotype1_df.shape[1]} samples') 137 | logger.write(f' * phenotype group 2: {phenotype2_df.shape[1]} samples') 138 | 139 | if covariates1_df is not None: 140 | assert np.all(phenotype1_df.columns == covariates1_df.index) 141 | logger.write(f' * phenotype group 1: {covariates1_df.shape[1]} covariates') 142 | residualizer1 = Residualizer(torch.tensor(covariates1_df.values, dtype=torch.float32).to(device)) 143 | else: 144 | residualizer1 = None 145 | 146 | if covariates2_df is not None: 147 | assert np.all(phenotype2_df.columns == covariates2_df.index) 148 | logger.write(f' * phenotype group 2: {covariates2_df.shape[1]} covariates') 149 | residualizer2 = Residualizer(torch.tensor(covariates2_df.values, dtype=torch.float32).to(device)) 150 | else: 151 | residualizer2 = None 152 | 153 | if maf_threshold > 0: 154 | logger.write(f' * applying in-sample {maf_threshold} MAF filter (in at 
least one cohort)') 155 | 156 | genotype1_ix = np.array([genotype_df.columns.tolist().index(i) for i in phenotype1_df.columns]) 157 | genotype1_ix_t = torch.from_numpy(genotype1_ix).to(device) 158 | genotype2_ix = np.array([genotype_df.columns.tolist().index(i) for i in phenotype2_df.columns]) 159 | genotype2_ix_t = torch.from_numpy(genotype2_ix).to(device) 160 | 161 | igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype1_df, phenotype_pos_df, window=window) 162 | coloc_df = [] 163 | start_time = time.time() 164 | logger.write(' * Computing pairwise colocalization') 165 | for phenotype1, genotypes, genotype_range, phenotype_id in igc.generate_data(verbose=verbose): 166 | phenotype2 = phenotype2_df.loc[phenotype_id] 167 | 168 | # copy to GPU 169 | phenotype1_t = torch.tensor(phenotype1, dtype=torch.float).to(device) 170 | phenotype2_t = torch.tensor(phenotype2, dtype=torch.float).to(device) 171 | genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device) 172 | genotypes1_t = genotypes_t[:,genotype1_ix_t] 173 | genotypes2_t = genotypes_t[:,genotype2_ix_t] 174 | del genotypes_t 175 | 176 | impute_mean(genotypes1_t) 177 | impute_mean(genotypes2_t) 178 | # filter monomorphic sites 179 | m = ((genotypes1_t==0).all(1) | (genotypes1_t==1).all(1) | (genotypes1_t==2).all(1) | 180 | (genotypes2_t==0).all(1) | (genotypes2_t==1).all(1) | (genotypes2_t==2).all(1)) 181 | genotypes1_t = genotypes1_t[~m] 182 | genotypes2_t = genotypes2_t[~m] 183 | 184 | if maf_threshold > 0: 185 | maf1_t = calculate_maf(genotypes1_t) 186 | maf2_t = calculate_maf(genotypes2_t) 187 | mask_t = (maf1_t >= maf_threshold) | (maf2_t >= maf_threshold) 188 | genotypes1_t = genotypes1_t[mask_t] 189 | genotypes2_t = genotypes2_t[mask_t] 190 | 191 | coloc_t = coloc(genotypes1_t, genotypes2_t, phenotype1_t, phenotype2_t, 192 | residualizer1=residualizer1, residualizer2=residualizer2, 193 | p1=p1, p2=p2, p12=p12, mode=mode) 194 | coloc_df.append(coloc_t.cpu().numpy()) 195 | logger.write(' time elapsed: {:.2f} min'.format((time.time()-start_time)/60)) 196 | coloc_df = pd.DataFrame(coloc_df, columns=[f'pp_h{i}_abf' for i in range(5)], index=phenotype1_df.index) 197 | logger.write('done.') 198 | return coloc_df 199 | -------------------------------------------------------------------------------- /tensorqtl/core.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import pandas as pd 4 | import scipy.stats as stats 5 | import scipy.optimize 6 | from scipy.special import loggamma 7 | import sys 8 | import re 9 | import subprocess 10 | 11 | # check R 12 | has_rpy2 = False 13 | try: 14 | subprocess.check_call('which R', shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) 15 | subprocess.check_call("R -e 'library(qvalue)'", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) 16 | import rpy2 17 | import rfunc 18 | has_rpy2 = True 19 | except: 20 | print("Warning: 'rfunc' cannot be imported. 
R with the 'qvalue' library and the 'rpy2' Python package are needed to compute q-values.") 21 | 22 | 23 | output_dtype_dict = { 24 | 'num_var':np.int32, 25 | 'beta_shape1':np.float32, 26 | 'beta_shape2':np.float32, 27 | 'true_df':np.float32, 28 | 'pval_true_df':np.float64, 29 | 'variant_id':str, 30 | 'start_distance':np.int32, 31 | 'end_distance':np.int32, 32 | 'ma_samples':np.int32, 33 | 'ma_count':np.int32, 34 | 'af':np.float32, 35 | 'pval_nominal':np.float64, 36 | 'slope':np.float32, 37 | 'slope_se':np.float32, 38 | 'pval_perm':np.float64, 39 | 'pval_beta':np.float64, 40 | } 41 | 42 | 43 | class SimpleLogger(object): 44 | def __init__(self, logfile=None, verbose=True): 45 | self.console = sys.stdout 46 | self.verbose = verbose 47 | if logfile is not None: 48 | self.log = open(logfile, 'w') 49 | else: 50 | self.log = None 51 | 52 | def write(self, message): 53 | if self.verbose: 54 | self.console.write(message+'\n') 55 | if self.log is not None: 56 | self.log.write(message+'\n') 57 | self.log.flush() 58 | 59 | #------------------------------------------------------------------------------ 60 | # Core classes/functions for mapping associations on GPU 61 | #------------------------------------------------------------------------------ 62 | class Residualizer(object): 63 | def __init__(self, C_t): 64 | # center and orthogonalize 65 | self.Q_t, _ = torch.linalg.qr(C_t - C_t.mean(0)) 66 | self.dof = C_t.shape[0] - 2 - C_t.shape[1] 67 | 68 | def transform(self, M_t, center=True): 69 | """Residualize rows of M wrt columns of C""" 70 | M0_t = M_t - M_t.mean(1, keepdim=True) 71 | if center: 72 | M0_t = M0_t - torch.mm(torch.mm(M0_t, self.Q_t), self.Q_t.t()) 73 | else: 74 | M0_t = M_t - torch.mm(torch.mm(M0_t, self.Q_t), self.Q_t.t()) 75 | return M0_t 76 | 77 | 78 | def calculate_maf(genotype_t, alleles=2): 79 | """Calculate minor allele frequency""" 80 | af_t = genotype_t.sum(1) / (alleles * genotype_t.shape[1]) 81 | return torch.where(af_t > 0.5, 1 - af_t, af_t) 82 | 83 | 84 | def get_allele_stats(genotype_t): 85 | """Returns allele frequency, minor allele samples, and minor allele counts (row-wise).""" 86 | # allele frequency 87 | n2 = 2 * genotype_t.shape[1] 88 | af_t = genotype_t.sum(1) / n2 89 | # minor allele samples and counts 90 | ix_t = af_t <= 0.5 91 | m = genotype_t > 0.5 92 | a = m.sum(1).int() 93 | b = (genotype_t < 1.5).sum(1).int() 94 | ma_samples_t = torch.where(ix_t, a, b) 95 | a = (genotype_t * m.float()).sum(1).int() 96 | # a = (genotype_t * m.float()).sum(1).round().int() # round for missing/imputed genotypes 97 | ma_count_t = torch.where(ix_t, a, n2-a) 98 | return af_t, ma_samples_t, ma_count_t 99 | 100 | 101 | def filter_maf(genotypes_t, variant_ids, maf_threshold, alleles=2): 102 | """Calculate MAF and filter genotypes that don't pass threshold""" 103 | af_t = genotypes_t.sum(1) / (alleles * genotypes_t.shape[1]) 104 | maf_t = torch.where(af_t > 0.5, 1 - af_t, af_t) 105 | if maf_threshold > 0: 106 | mask_t = maf_t >= maf_threshold 107 | genotypes_t = genotypes_t[mask_t] 108 | variant_ids = variant_ids[mask_t.cpu().numpy().astype(bool)] 109 | af_t = af_t[mask_t] 110 | return genotypes_t, variant_ids, af_t 111 | 112 | 113 | def filter_maf_interaction(genotypes_t, interaction_mask_t=None, maf_threshold_interaction=0.05): 114 | # filter monomorphic sites (to avoid colinearity) 115 | mask_t = ~((genotypes_t==0).all(1) | (genotypes_t==1).all(1) | (genotypes_t==2).all(1)) 116 | if interaction_mask_t is not None: 117 | upper_t = calculate_maf(genotypes_t[:, interaction_mask_t]) >= 
maf_threshold_interaction - 1e-7 118 | lower_t = calculate_maf(genotypes_t[:,~interaction_mask_t]) >= maf_threshold_interaction - 1e-7 119 | mask_t = mask_t & upper_t & lower_t 120 | genotypes_t = genotypes_t[mask_t] 121 | return genotypes_t, mask_t 122 | 123 | 124 | def impute_mean(genotypes_t, missing=-9): 125 | """Impute missing genotypes to mean""" 126 | m = genotypes_t == missing 127 | ix = torch.nonzero(m, as_tuple=True)[0] 128 | if len(ix) > 0: 129 | a = genotypes_t.sum(1) 130 | b = m.sum(1).float() 131 | mu = (a - missing*b) / (genotypes_t.shape[1] - b) 132 | genotypes_t[m] = mu[ix] 133 | 134 | 135 | def center_normalize(M_t, dim=0): 136 | """Center and normalize M""" 137 | N_t = M_t - M_t.mean(dim=dim, keepdim=True) 138 | return N_t / torch.sqrt(torch.pow(N_t, 2).sum(dim=dim, keepdim=True)) 139 | 140 | 141 | def calculate_corr(genotype_t, phenotype_t, residualizer=None, return_var=False): 142 | """Calculate correlation between normalized residual genotypes and phenotypes""" 143 | 144 | # residualize 145 | if residualizer is not None: 146 | genotype_res_t = residualizer.transform(genotype_t) # variants x samples 147 | phenotype_res_t = residualizer.transform(phenotype_t) # phenotypes x samples 148 | else: 149 | genotype_res_t = genotype_t 150 | phenotype_res_t = phenotype_t 151 | 152 | if return_var: 153 | genotype_var_t = genotype_res_t.var(1) 154 | phenotype_var_t = phenotype_res_t.var(1) 155 | 156 | # center and normalize 157 | genotype_res_t = center_normalize(genotype_res_t, dim=1) 158 | phenotype_res_t = center_normalize(phenotype_res_t, dim=1) 159 | 160 | # correlation 161 | if return_var: 162 | return torch.mm(genotype_res_t, phenotype_res_t.t()), genotype_var_t, phenotype_var_t 163 | else: 164 | return torch.mm(genotype_res_t, phenotype_res_t.t()) 165 | 166 | 167 | def get_t_pval(t, df, log=False): 168 | """ 169 | Get p-value corresponding to t statistic and degrees of freedom (df). t and/or df can be arrays. 170 | If log=True, returns -log10(P). 171 | """ 172 | if not log: 173 | return 2 * stats.t.cdf(-abs(t), df) 174 | else: 175 | if has_rpy2: 176 | return -(rfunc.t_cdf(-abs(t), df, lower_tail=True, log=True) + np.log(2)) * np.log10(np.e) 177 | else: 178 | raise ValueError("R and rpy2 are required to compute -log10(P)") 179 | 180 | 181 | def calculate_interaction_nominal(genotypes_t, phenotypes_t, interaction_t, residualizer=None, 182 | return_sparse=False, tstat_threshold=None, variant_ids=None): 183 | """ 184 | Solve y ~ g + i + g:i, where i is an interaction vector or matrix 185 | 186 | Inputs 187 | genotypes_t: [num_genotypes x num_samples] 188 | phenotypes_t: [num_phenotypes x num_samples] 189 | interaction_t: [num_samples x num_interactions] 190 | 191 | Outputs 192 | if return_sparse is False (default): 193 | tstat_t, b_t, b_se_t, af_t, ma_samples_t, ma_count_t 194 | tstat_t, b_t, b_se_t columns: [g, i_1 ... i_n, gi_1, ... 
gi_n] 195 | where n is the number of interactions 196 | if return_sparse is True: 197 | tstat_g_t, tstat_i_t, tstat_gi_t, af_t, ix 198 | ix: indexes [genotype, phenotype] 199 | """ 200 | ng, ns = genotypes_t.shape 201 | nps = phenotypes_t.shape[0] 202 | ni = interaction_t.shape[1] 203 | 204 | # centered inputs 205 | g0_t = genotypes_t - genotypes_t.mean(1, keepdim=True) # genotypes x samples 206 | gi_t = (genotypes_t.unsqueeze(2) * interaction_t.unsqueeze(0)) # genotypes x samples x interactions 207 | gi0_t = gi_t - gi_t.mean(1, keepdim=True) # mean across samples 208 | i0_t = interaction_t - interaction_t.mean(0) # samples x interactions 209 | p0_t = phenotypes_t - phenotypes_t.mean(1, keepdim=True) # 1 x samples 210 | 211 | # residualize rows 212 | if residualizer is not None: 213 | p0_t = residualizer.transform(p0_t, center=False) 214 | g0_t = residualizer.transform(g0_t, center=False) 215 | i0_t = residualizer.transform(i0_t.t(), center=False).t() 216 | for k in range(i0_t.shape[1]): 217 | gi0_t[..., k] = residualizer.transform(gi0_t[..., k], center=False) 218 | i0_t = i0_t.repeat(ng, 1, 1) 219 | 220 | # regression (in float; loss of precision may occur in edge cases) 221 | X_t = torch.cat([g0_t.unsqueeze(-1), i0_t, gi0_t], 2) # ng x ns x (1+2*ni) 222 | try: 223 | Xinv = torch.matmul(torch.transpose(X_t, 1, 2), X_t).inverse() # ng x (1+2*ni) x (1+2*ni) 224 | except Exception as e: 225 | if variant_ids is not None and len(e.args) >= 1: 226 | i = int(re.findall('For batch (\d+)', str(e))[0]) 227 | e.args = (e.args[0] + f'\n Likely problematic variant: {variant_ids[i]} ',) + e.args[1:] 228 | raise 229 | 230 | p0_tile_t = p0_t.unsqueeze(0).expand([ng, *p0_t.shape]) # ng x np x ns 231 | 232 | # calculate b, b_se 233 | # [(ng x nb x nb) x (ng x nb x ns)] x (ng x ns x np) = (ng x nb x np) 234 | b_t = torch.matmul(torch.matmul(Xinv, torch.transpose(X_t, 1, 2)), torch.transpose(p0_tile_t, 1, 2)) 235 | nb = b_t.shape[1] 236 | # residualizer.dof already includes intercept, b_g, add b_i and b_gi for each interaction 237 | if residualizer is not None: 238 | dof = residualizer.dof - 2*ni 239 | else: 240 | dof = phenotypes_t.shape[1] - 2 - 2*ni 241 | if nps == 1: # single phenotype case 242 | r_t = torch.matmul(X_t, b_t).squeeze() - p0_t 243 | rss_t = (r_t*r_t).sum(1) 244 | b_se_t = torch.sqrt(Xinv[:, torch.eye(nb, dtype=torch.uint8).bool()] * rss_t.unsqueeze(1) / dof) 245 | b_t = b_t.squeeze(2) 246 | # r_t = tf.squeeze(tf.matmul(X_t, b_t)) - p0_t # (ng x ns x 3) x (ng x 3 x 1) 247 | # rss_t = tf.reduce_sum(tf.multiply(r_t, r_t), axis=1) 248 | # b_se_t = tf.sqrt( tf.matrix_diag_part(Xinv) * tf.expand_dims(rss_t, 1) / dof ) 249 | else: 250 | # b_t = tf.matmul(p0_tile_t, tf.matmul(Xinv, X_t, transpose_b=True), transpose_b=True) 251 | # convert to ng x np x 3?? 
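# Multi-phenotype case: X_t is (ng x ns x nb) and b_t is (ng x nb x np), so the
# batched matmul below gives residuals r_t of shape (ng x ns x np); the diagonal
# of Xinv scaled by rss_t/dof yields the squared standard error of each
# coefficient per genotype-phenotype pair (the repeat([1,3,1]) assumes nb = 3,
# i.e., a single interaction).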
252 | r_t = torch.matmul(X_t, b_t) - torch.transpose(p0_tile_t, 1, 2) # (ng x ns x np) 253 | rss_t = (r_t*r_t).sum(1) # ng x np 254 | b_se_t = torch.sqrt(Xinv[:, torch.eye(nb, dtype=torch.uint8).bool()].unsqueeze(-1).repeat([1,1,nps]) * rss_t.unsqueeze(1).repeat([1,3,1]) / dof) 255 | # b_se_t = tf.sqrt(tf.tile(tf.expand_dims(tf.matrix_diag_part(Xinv), 2), [1,1,nps]) * tf.tile(tf.expand_dims(rss_t, 1), [1,3,1]) / dof) # (ng x 3) -> (ng x 3 x np) 256 | 257 | tstat_t = (b_t.double() / b_se_t.double()).float() # (ng x nb x np) 258 | 259 | # tdist = tfp.distributions.StudentT(np.float64(dof), loc=np.float64(0.0), scale=np.float64(1.0)) 260 | if not return_sparse: 261 | # calculate pval 262 | # pval_t = tf.scalar_mul(2, tdist.cdf(-tf.abs(tstat_t))) # (ng x 3 x np) 263 | af_t, ma_samples_t, ma_count_t = get_allele_stats(genotypes_t) 264 | return tstat_t, b_t, b_se_t, af_t, ma_samples_t, ma_count_t 265 | 266 | else: # sparse output 267 | if ni > 1: 268 | raise NotImplementedError("Sparse mode not yet supported for >1 interactions") 269 | af_t = genotypes_t.sum(1) / (2*ns) 270 | tstat_g_t = tstat_t[:,0,:] # genotypes x phenotypes 271 | tstat_i_t = tstat_t[:,1,:] 272 | tstat_gi_t = tstat_t[:,2,:] 273 | m = tstat_gi_t.abs() >= tstat_threshold 274 | tstat_g_t = tstat_g_t[m] 275 | tstat_i_t = tstat_i_t[m] 276 | tstat_gi_t = tstat_gi_t[m] 277 | ix = m.nonzero(as_tuple=False) # indexes: [genotype, phenotype] 278 | return tstat_g_t, tstat_i_t, tstat_gi_t, af_t[ix[:,0]], ix 279 | 280 | 281 | def linreg(X_t, y_t, dtype=torch.float64): 282 | """ 283 | Robust linear regression. Solves y = Xb, standardizing X. 284 | The first column of X must be the intercept. 285 | """ 286 | x_std_t = X_t.std(0) 287 | x_mean_t = X_t.mean(0) 288 | x_std_t[0] = 1 289 | x_mean_t[0] = 0 290 | 291 | # standardize X 292 | Xtilde_t = (X_t - x_mean_t) / x_std_t 293 | 294 | # regression 295 | XtX_t = torch.matmul(Xtilde_t.T, Xtilde_t) 296 | Xty_t = torch.matmul(Xtilde_t.T, y_t) 297 | b_t = torch.linalg.solve(XtX_t, Xty_t.unsqueeze(-1)) 298 | b_t = b_t.squeeze() 299 | 300 | # compute s.e. 301 | dof = X_t.shape[0] - X_t.shape[1] 302 | r_t = y_t - torch.matmul(Xtilde_t, b_t) 303 | sigma2_t = (r_t*r_t).sum() / dof 304 | XtX_inv_t = torch.linalg.solve(XtX_t, torch.eye(X_t.shape[1], dtype=dtype).to(X_t.device)) 305 | var_b_t = sigma2_t * XtX_inv_t 306 | b_se_t = torch.sqrt(torch.diag(var_b_t)) 307 | 308 | # rescale 309 | b_t /= x_std_t 310 | b_se_t /= x_std_t 311 | 312 | # adjust intercept 313 | b_t[0] -= torch.sum(x_mean_t * b_t) 314 | ms_t = x_mean_t / x_std_t 315 | b_se_t[0] = torch.sqrt(b_se_t[0]**2 + torch.matmul(torch.matmul(ms_t.T, var_b_t), ms_t)) 316 | 317 | return b_t, b_se_t 318 | 319 | 320 | def filter_covariates(covariates_t, log_counts_t, tstat_threshold=2): 321 | """ 322 | Inputs: 323 | covariates0_t: covariates matrix (samples x covariates) 324 | including genotype PCs, PEER factors, etc. 
325 | ** with intercept in first column **
326 | log_counts_t: counts vector (samples)
327 | """
328 | assert (covariates_t[:,0] == 1).all()
329 | b_t, b_se_t = linreg(covariates_t, log_counts_t)
330 | tstat_t = b_t / b_se_t
331 | m = tstat_t.abs() > tstat_threshold
332 | m[0] = True # keep intercept
333 | return covariates_t[:, m]
334 | 
335 | 
336 | #------------------------------------------------------------------------------
337 | # Functions for beta-approximating empirical p-values
338 | #------------------------------------------------------------------------------
339 | def pval_from_corr(r2, dof, logp=False):
340 | tstat2 = dof * r2 / (1 - r2)
341 | return get_t_pval(np.sqrt(tstat2), dof, log=logp)
342 | 
343 | 
344 | def beta_shape_1_from_dof(r2, dof):
345 | """compute the Beta shape 1 parameter from moment matching"""
346 | pval = pval_from_corr(r2, dof)
347 | mean = np.mean(pval)
348 | var = np.var(pval)
349 | return mean * (mean * (1.0-mean) / var - 1.0)
350 | 
351 | 
352 | def beta_log_likelihood(x, shape1, shape2):
353 | """negative log-likelihood of beta distribution"""
354 | logbeta = loggamma(shape1) + loggamma(shape2) - loggamma(shape1+shape2)
355 | return (1.0-shape1)*np.sum(np.log(x)) + (1.0-shape2)*np.sum(np.log(1.0-x)) + len(x)*logbeta
356 | 
357 | 
358 | def fit_beta_parameters(r2_perm, dof_init, tol=1e-4, return_minp=False):
359 | """
360 | r2_perm: array of max. r2 values from permutations
361 | dof_init: degrees of freedom
362 | """
363 | try:
364 | # Find the degrees of freedom such that the first beta parameter is
365 | # close to 1, by finding the root where the log of the beta parameter
366 | # as a function of r2_perm and dof is 0. Optimizing log(beta shape 1)
367 | # with a parameterization of log(dof) makes this close to a linear
368 | # function.
369 | log_true_dof = scipy.optimize.newton(lambda x: np.log(beta_shape_1_from_dof(r2_perm, np.exp(x))),
370 | np.log(dof_init), tol=tol, maxiter=50)
371 | true_dof = np.exp(log_true_dof)
372 | except:
373 | # fall back to minimization
374 | print('WARNING: scipy.optimize.newton failed to converge (running scipy.optimize.minimize)')
375 | res = scipy.optimize.minimize(lambda x: np.abs(beta_shape_1_from_dof(r2_perm, x) - 1),
376 | dof_init, method='Nelder-Mead', tol=tol)
377 | true_dof = res.x[0]
378 | 
379 | pval = pval_from_corr(r2_perm, true_dof)
380 | mean, var = np.mean(pval), np.var(pval)
381 | beta_shape1 = mean * (mean * (1 - mean) / var - 1)
382 | beta_shape2 = beta_shape1 * (1/mean - 1)
383 | res = scipy.optimize.minimize(lambda s: beta_log_likelihood(pval, s[0], s[1]), [beta_shape1, beta_shape2], method='Nelder-Mead', tol=tol)
384 | beta_shape1, beta_shape2 = res.x
385 | if return_minp:
386 | return beta_shape1, beta_shape2, true_dof, pval
387 | else:
388 | return beta_shape1, beta_shape2, true_dof
389 | 
390 | 
391 | def calculate_beta_approx_pval(r2_perm, r2_nominal, dof_init, tol=1e-4):
392 | """
393 | r2_nominal: nominal max. r2 (scalar or array)
394 | r2_perm: array of max. 
r2 values from permutations 395 | dof_init: degrees of freedom 396 | """ 397 | beta_shape1, beta_shape2, true_dof = fit_beta_parameters(r2_perm, dof_init, tol) 398 | pval_true_dof = pval_from_corr(r2_nominal, true_dof) 399 | pval_beta = stats.beta.cdf(pval_true_dof, beta_shape1, beta_shape2) 400 | return pval_beta, beta_shape1, beta_shape2, true_dof, pval_true_dof 401 | 402 | #------------------------------------------------------------------------------ 403 | # i/o functions 404 | #------------------------------------------------------------------------------ 405 | 406 | def read_phenotype_bed(phenotype_bed): 407 | """Load phenotype BED file as phenotype and position DataFrames""" 408 | if phenotype_bed.lower().endswith(('.bed.gz', '.bed')): 409 | phenotype_df = pd.read_csv(phenotype_bed, sep='\t', index_col=3, dtype={'#chr':str, '#Chr':str}) 410 | elif phenotype_bed.lower().endswith('.bed.parquet'): 411 | phenotype_df = pd.read_parquet(phenotype_bed) 412 | phenotype_df.set_index(phenotype_df.columns[3], inplace=True) 413 | else: 414 | raise ValueError('Unsupported file type.') 415 | phenotype_df.rename(columns={i:i.lower().replace('#chr','chr') for i in phenotype_df.columns[:3]}, inplace=True) 416 | 417 | phenotype_df['start'] += 1 # change to 1-based 418 | pos_df = phenotype_df[['chr', 'start', 'end']] 419 | phenotype_df.drop(['chr', 'start', 'end'], axis=1, inplace=True) 420 | 421 | # make sure BED file is properly sorted 422 | assert pos_df.equals( 423 | pos_df.groupby('chr', sort=False, group_keys=False).apply(lambda x: x.sort_values(['start', 'end'])) 424 | ), "Positions in BED file must be sorted." 425 | 426 | if (pos_df['start'] == pos_df['end']).all(): 427 | pos_df = pos_df[['chr', 'end']].rename(columns={'end':'pos'}) 428 | 429 | return phenotype_df, pos_df 430 | -------------------------------------------------------------------------------- /tensorqtl/eigenmt.py: -------------------------------------------------------------------------------- 1 | """eigenmt.py: Re-implementation of eigenMT (Davis et al., AJHG, 2016)""" 2 | 3 | __author__ = "Francois Aguet" 4 | __copyright__ = "Copyright 2019, The Broad Institute" 5 | __license__ = "BSD3" 6 | 7 | import torch 8 | import numpy as np 9 | import pandas as pd 10 | import time 11 | import os 12 | import sys 13 | from collections import OrderedDict 14 | 15 | sys.path.insert(1, os.path.dirname(__file__)) 16 | import genotypeio 17 | from core import * 18 | 19 | 20 | def lw_shrink(X_t): 21 | """ 22 | Estimates the shrunk Ledoit-Wolf covariance matrix 23 | 24 | Args: 25 | X_t: samples x variants 26 | 27 | Returns: 28 | shrunk_cov_t: shrunk covariance 29 | shrinkage_t: shrinkage coefficient 30 | 31 | Adapted from scikit-learn: 32 | https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/covariance/shrunk_covariance_.py 33 | """ 34 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 35 | 36 | if len(X_t.shape) == 2: 37 | n_samples, n_features = X_t.shape # samples x variants 38 | X_t = X_t - X_t.mean(0) 39 | X2_t = X_t.pow(2) 40 | emp_cov_trace_sum = X2_t.sum() / n_samples 41 | delta_ = torch.mm(X_t.t(), X_t).pow(2).sum() / n_samples**2 42 | beta_ = torch.mm(X2_t.t(), X2_t).sum() 43 | beta = 1. / (n_features * n_samples) * (beta_ / n_samples - delta_) 44 | delta = delta_ - 1. 
* emp_cov_trace_sum**2 / n_features 45 | delta /= n_features 46 | beta = torch.min(beta, delta) 47 | shrinkage_t = 0 if beta == 0 else beta / delta 48 | emp_cov_t = torch.mm(X_t.t(), X_t) / n_samples 49 | mu_t = torch.trace(emp_cov_t) / n_features 50 | shrunk_cov_t = (1. - shrinkage_t) * emp_cov_t 51 | shrunk_cov_t.view(-1)[::n_features + 1] += shrinkage_t * mu_t # add to diagonal 52 | else: # broadcast along first dimension 53 | n_samples, n_features = X_t.shape[1:] # samples x variants 54 | X_t = X_t - X_t.mean(1, keepdim=True) 55 | X2_t = X_t.pow(2) 56 | emp_cov_trace_sum = X2_t.sum([1,2]) / n_samples 57 | delta_ = torch.matmul(torch.transpose(X_t, 1, 2), X_t).pow(2).sum([1,2]) / n_samples**2 58 | beta_ = torch.matmul(torch.transpose(X2_t, 1, 2), X2_t).sum([1,2]) 59 | beta = 1. / (n_features * n_samples) * (beta_ / n_samples - delta_) 60 | delta = delta_ - 1. * emp_cov_trace_sum**2 / n_features 61 | delta /= n_features 62 | beta = torch.min(beta, delta) 63 | shrinkage_t = torch.where(beta==0, torch.zeros(beta.shape).to(device), beta/delta) 64 | emp_cov_t = torch.matmul(torch.transpose(X_t, 1, 2), X_t) / n_samples 65 | mu_t = torch.diagonal(emp_cov_t, dim1=1, dim2=2).sum(1) / n_features 66 | shrunk_cov_t = (1 - shrinkage_t.reshape([shrinkage_t.shape[0], 1, 1])) * emp_cov_t 67 | 68 | ix = torch.LongTensor(np.array([np.arange(0, n_features**2, n_features+1)+i*n_features**2 for i in range(X_t.shape[0])])).to(device) 69 | shrunk_cov_t.view(-1)[ix] += (shrinkage_t * mu_t).unsqueeze(-1) # add to diagonal 70 | 71 | return shrunk_cov_t, shrinkage_t 72 | 73 | 74 | def find_num_eigs(eigenvalues, variance, var_thresh=0.99): 75 | """Returns the number of eigenvalues required to reach threshold of variance explained.""" 76 | eigenvalues = np.sort(eigenvalues)[::-1] 77 | running_sum = 0 78 | counter = 0 79 | while running_sum < variance * var_thresh: 80 | running_sum += eigenvalues[counter] 81 | counter += 1 82 | return counter 83 | 84 | 85 | def compute_tests(genotypes_t, var_thresh=0.99, variant_window=200): 86 | """determine effective number of independent variants (M_eff)""" 87 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 88 | 89 | # break into windows 90 | windows = torch.split(genotypes_t, variant_window) 91 | 92 | if len(windows)>1: 93 | shrunk_cov_t, shrinkage_t = lw_shrink(torch.transpose(torch.stack(windows[:-1]), 1, 2)) 94 | 95 | n_samples, n_features = windows[0].T.shape 96 | # indices of diagonals 97 | ix = torch.LongTensor(np.array([np.arange(0, n_features**2, n_features+1)+i*n_features**2 for i in range(shrunk_cov_t.shape[0])])).to(device) 98 | shrunk_precision_t = torch.zeros(shrunk_cov_t.shape).to(device) 99 | shrunk_precision_t.view(-1)[ix] = shrunk_cov_t.view(-1)[ix].pow(-0.5) 100 | shrunk_cor_t = torch.matmul(torch.matmul(shrunk_precision_t, shrunk_cov_t), shrunk_precision_t) 101 | # eigenvalues_t,_ = torch.symeig(shrunk_cor_t, eigenvectors=False) # will be deprecated 102 | eigenvalues_t = torch.linalg.eigvalsh(shrunk_cor_t) # ~2x slower than symeig with 1.10.0+cu102 and 2.0.1+cu118 103 | 104 | # last window 105 | shrunk_cov0_t, shrinkage0_t = lw_shrink(windows[-1].t()) 106 | shrunk_precision0_t = torch.diag(torch.diag(shrunk_cov0_t).pow(-0.5)) 107 | shrunk_cor0_t = torch.mm(torch.mm(shrunk_precision0_t, shrunk_cov0_t), shrunk_precision0_t) 108 | # eigenvalues0_t,_ = torch.symeig(shrunk_cor0_t, eigenvectors=False) 109 | eigenvalues0_t = torch.linalg.eigvalsh(shrunk_cor0_t) 110 | 111 | if len(windows) > 1: 112 | eigenvalues = 
list(eigenvalues_t.cpu().numpy())
113 | eigenvalues.append(eigenvalues0_t.cpu().numpy())
114 | else:
115 | eigenvalues = [eigenvalues0_t.cpu().numpy()]
116 | 
117 | m_eff = 0
118 | for ev,m in zip(eigenvalues, [i.shape[0] for i in windows]):
119 | ev[ev < 0] = 0
120 | m_eff += find_num_eigs(ev, m, var_thresh=var_thresh)
121 | 
122 | return m_eff
123 | 
124 | 
125 | 
126 | def run_eigenmt(genotype_df, variant_df, phenotype_df, phenotype_pos_df, interaction_s=None,
127 | maf_threshold=0, var_thresh=0.99, variant_window=200, window=1000000, verbose=True, logger=None):
128 | """Standalone function for computing eigenMT correction.
129 | 
130 | Returns the number of tests for each gene
131 | """
132 | 
133 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
134 | 
135 | if logger is None:
136 | logger = SimpleLogger()
137 | 
138 | logger.write('eigenMT: estimating number of independent variants tested for each phenotype')
139 | 
140 | 
141 | logger.write(f' * {phenotype_df.shape[1]} samples')
142 | logger.write(f' * {phenotype_df.shape[0]} phenotypes')
143 | logger.write(f' * {genotype_df.shape[0]} variants')
144 | 
145 | if interaction_s is not None and maf_threshold > 0:
146 | interaction_mask_t = torch.BoolTensor(interaction_s >= interaction_s.median()).to(device)
147 | else:
148 | interaction_mask_t = None
149 | 
150 | genotype_ix = np.array([genotype_df.columns.tolist().index(i) for i in phenotype_df.columns])
151 | genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
152 | 
153 | igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, window=window)
154 | start_time = time.time()
155 | m_eff = OrderedDict()
156 | for k, (phenotype, genotypes, genotype_range, phenotype_id) in enumerate(igc.generate_data(verbose=verbose), 1):
157 | 
158 | # copy genotypes to GPU
159 | genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
160 | genotypes_t = genotypes_t[:,genotype_ix_t]
161 | impute_mean(genotypes_t)
162 | 
163 | if interaction_s is None:
164 | mask_t = calculate_maf(genotypes_t) >= maf_threshold
165 | genotypes_t = genotypes_t[mask_t]
166 | else:
167 | genotypes_t, mask_t = filter_maf_interaction(genotypes_t, interaction_mask_t=interaction_mask_t, maf_threshold_interaction=maf_threshold)
168 | 
169 | m_eff[phenotype_id] = compute_tests(genotypes_t, var_thresh=var_thresh, variant_window=variant_window)
170 | 
171 | logger.write(f' time elapsed: {(time.time()-start_time)/60:.2f} min')
172 | return pd.Series(m_eff)
173 | 
174 | 
175 | def padjust_bh(p):
176 | """Benjamini-Hochberg adjusted p-values"""
177 | if not np.all(np.isfinite(p)):
178 | raise ValueError('P values must be finite.')
179 | n = len(p)
180 | i = np.arange(n,0,-1)
181 | o = np.argsort(p)[::-1]
182 | ro = np.argsort(o)
183 | return np.minimum(1, np.minimum.accumulate(np.float64(n)/i * np.array(p)[o]))[ro]
184 | --------------------------------------------------------------------------------
/tensorqtl/genotypeio.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import tempfile
3 | import numpy as np
4 | import subprocess
5 | import os
6 | import gzip
7 | import sys
8 | import threading
9 | import queue
10 | import bisect
11 | from pandas_plink import read_plink
12 | 
13 | sys.path.insert(1, os.path.dirname(__file__))
14 | from core import *
15 | 
16 | try:
17 | import pgen
18 | except ImportError as e:
19 | pgen = None
20 | 
21 | 
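# Example usage of this module (a sketch; the file prefixes/paths below are hypothetical):
#
#   import genotypeio
#   from core import read_phenotype_bed
#
#   phenotype_df, phenotype_pos_df = read_phenotype_bed('phenotypes.bed.gz')
#   # load genotypes and variant positions from PLINK pgen/psam/pvar or bed/bim/fam files
#   genotype_df, variant_df = genotypeio.load_genotypes('genotypes')
#   # iterate over phenotypes together with the genotypes in each cis-window
#   igc = genotypeio.InputGeneratorCis(genotype_df, variant_df, phenotype_df,
#                                      phenotype_pos_df, window=1000000)
#   for phenotype, genotypes, genotype_range, phenotype_id in igc.generate_data():
#       pass  # compute associations for this phenotype
22 | 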
gt_to_dosage_dict = {'0/0':0, '0/1':1, '1/1':2, './.':np.nan, 23 | '0|0':0, '0|1':1, '1|0':1, '1|1':2, '.|.':np.nan} 24 | 25 | 26 | def _check_dependency(name): 27 | e = subprocess.call(f"which {name}", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) 28 | if e != 0: 29 | raise RuntimeError(f"External dependency '{name}' not installed") 30 | 31 | 32 | def print_progress(k, n, entity): 33 | s = f'\r processing {entity} {k}/{n}' 34 | if k == n: 35 | s += '\n' 36 | sys.stdout.write(s) 37 | sys.stdout.flush() 38 | 39 | 40 | class BackgroundGenerator(threading.Thread): 41 | # Adapted from https://github.com/justheuristic/prefetch_generator 42 | def __init__(self, generator, max_prefetch=10): 43 | threading.Thread.__init__(self) 44 | self.queue = queue.Queue(max_prefetch) 45 | self.generator = generator 46 | self.daemon = True 47 | self.start() 48 | 49 | def run(self): 50 | try: 51 | for item in self.generator: 52 | self.queue.put(item) 53 | except Exception as exception: 54 | self.queue.put(exception) 55 | self.queue.put(None) 56 | 57 | def next(self): 58 | next_item = self.queue.get() 59 | if next_item is None: 60 | self.join() 61 | raise StopIteration 62 | if isinstance(next_item, Exception): 63 | self.join() 64 | raise next_item 65 | return next_item 66 | 67 | def __next__(self): 68 | return self.next() 69 | 70 | def __iter__(self): 71 | return self 72 | 73 | class background: 74 | def __init__(self, max_prefetch=10): 75 | self.max_prefetch = max_prefetch 76 | def __call__(self,gen): 77 | def bg_generator(*args,**kwargs): 78 | return BackgroundGenerator(gen(*args,**kwargs), max_prefetch=self.max_prefetch) 79 | return bg_generator 80 | 81 | 82 | #------------------------------------------------------------------------------ 83 | # Functions for writing VCFs 84 | #------------------------------------------------------------------------------ 85 | def _get_vcf_opener(vcfpath): 86 | if vcfpath.endswith('.vcf.gz'): 87 | return gzip.open(vcfpath, 'rt') 88 | else: 89 | return open(vcfpath) 90 | 91 | 92 | def get_sample_ids(vcfpath): 93 | """Get sample IDs from VCF""" 94 | with _get_vcf_opener(vcfpath) as vcf: 95 | for header in vcf: 96 | if header[:2] == '##': continue 97 | break 98 | return header.strip().split('\t')[9:] 99 | 100 | 101 | def parse_genotypes(x, field='GT'): 102 | """Convert list of genotypes (str) to np.float32""" 103 | if field == 'GT': 104 | g = np.float32([gt_to_dosage_dict[i] for i in x]) 105 | elif field == 'DS': 106 | g = np.float32(x) 107 | return g 108 | 109 | 110 | def _get_field_ix(line, field): 111 | """Get position of field ('GT' or 'DS') in FORMAT""" 112 | fmt = line[8].split(':') 113 | if field not in fmt: 114 | raise ValueError(f'FORMAT field does not contain {field}') 115 | return fmt.index(field) 116 | 117 | #------------------------------------------------------------------------------ 118 | # Functions for loading regions/variants from VCFs 119 | #------------------------------------------------------------------------------ 120 | def _impute_mean(g, missing=-9, verbose=False): 121 | """Impute rows to mean (in place)""" 122 | if not g.dtype in [np.float32, np.float64]: 123 | raise ValueError('Input dtype must be np.float32 or np.float64') 124 | n = 0 125 | for i in np.where((g == missing).any(1))[0]: 126 | ix = g[i] == missing 127 | g[i][ix] = np.mean(g[i][~ix]) 128 | n += 1 129 | if verbose and n > 0: 130 | print(f' imputed at least 1 sample in {n}/{g.shape[0]} sites') 131 | 132 | 133 | class PlinkReader(object): 134 | def __init__(self, 
plink_prefix_path, select_samples=None, include_variants=None, 135 | exclude_variants=None, exclude_chrs=None, verbose=True, dtype=np.int8): 136 | """ 137 | Class for reading genotypes from PLINK bed files 138 | 139 | plink_prefix_path: prefix to PLINK bed,bim,fam files 140 | select_samples: specify a subset of samples 141 | 142 | Notes: 143 | Use this command to convert a VCF to PLINK format: 144 | plink2 --make-bed \ 145 | --output-chr chrM \ 146 | --vcf ${plink_prefix_path}.vcf.gz \ 147 | --out ${plink_prefix_path} 148 | 149 | If using plink v1, the --keep-allele-order flag must be included. 150 | 151 | Uses read_plink from pandas_plink. 152 | """ 153 | 154 | self.bim, self.fam, self.bed = read_plink(plink_prefix_path, verbose=verbose) 155 | self.bed = 2 - self.bed # flip allele order: PLINK uses REF as effect allele 156 | if dtype == np.int8: 157 | self.bed[np.isnan(self.bed)] = -9 # convert missing (NaN) to -9 for int8 158 | self.bed = self.bed.astype(dtype, copy=False) 159 | self.sample_ids = self.fam['iid'].tolist() 160 | if select_samples is not None: 161 | ix = [self.sample_ids.index(i) for i in select_samples] 162 | self.fam = self.fam.loc[ix] 163 | self.bed = self.bed[:,ix] 164 | self.sample_ids = self.fam['iid'].tolist() 165 | if include_variants is not None: 166 | m = self.bim['snp'].isin(include_variants).values 167 | self.bed = self.bed[m,:] 168 | self.bim = self.bim[m] 169 | self.bim.reset_index(drop=True, inplace=True) 170 | self.bim['i'] = self.bim.index 171 | if exclude_variants is not None: 172 | m = ~self.bim['snp'].isin(exclude_variants).values 173 | self.bed = self.bed[m,:] 174 | self.bim = self.bim[m] 175 | self.bim.reset_index(drop=True, inplace=True) 176 | self.bim['i'] = self.bim.index 177 | if exclude_chrs is not None: 178 | m = ~self.bim['chrom'].isin(exclude_chrs).values 179 | self.bed = self.bed[m,:] 180 | self.bim = self.bim[m] 181 | self.bim.reset_index(drop=True, inplace=True) 182 | self.bim['i'] = self.bim.index 183 | self.n_samples = self.fam.shape[0] 184 | self.chrs = list(self.bim['chrom'].unique()) 185 | self.variant_pos = {i:g['pos'] for i,g in self.bim.set_index('snp')[['chrom', 'pos']].groupby('chrom')} 186 | self.variant_pos_dict = self.bim.set_index('snp')['pos'].to_dict() 187 | 188 | def get_region_index(self, region_str, return_pos=False): 189 | s = region_str.split(':') 190 | chrom = s[0] 191 | c = self.bim[self.bim['chrom'] == chrom] 192 | if len(s) > 1: 193 | start, end = s[1].split('-') 194 | start = int(start) 195 | end = int(end) 196 | c = c[(c['pos'] >= start) & (c['pos'] <= end)] 197 | if return_pos: 198 | return c['i'].values, c.set_index('snp')['pos'] 199 | else: 200 | return c['i'].values 201 | 202 | def get_region(self, region_str, sample_ids=None, impute=False, verbose=False, dtype=np.int8): 203 | """Get genotypes for a region defined by 'chr:start-end' or 'chr'""" 204 | ix, pos_s = self.get_region_index(region_str, return_pos=True) 205 | g = self.bed[ix, :].compute().astype(dtype) 206 | if sample_ids is not None: 207 | ix = [self.sample_ids.index(i) for i in sample_ids] 208 | g = g[:, ix] 209 | if impute: 210 | _impute_mean(g, verbose=verbose) 211 | return g, pos_s 212 | 213 | def get_genotypes(self, variant_ids, sample_ids=None, impute=False, verbose=False, dtype=np.int8): 214 | """Load genotypes for selected variant IDs""" 215 | c = self.bim[self.bim['snp'].isin(variant_ids)] 216 | g = self.bed[c.i.values, :].compute().astype(dtype) 217 | if sample_ids is not None: 218 | ix = [self.sample_ids.index(i) for i in sample_ids] 219 | 
g = g[:, ix] 220 | if impute: 221 | _impute_mean(g, verbose=verbose) 222 | return g, c.set_index('snp')['pos'] 223 | 224 | def get_genotype(self, variant_id, sample_ids=None, impute=False, verbose=False, dtype=np.int8): 225 | """Load genotypes for a single variant ID as pd.Series""" 226 | g,_ = self.get_genotypes([variant_id], sample_ids=sample_ids, impute=impute, verbose=verbose, dtype=dtype) 227 | if sample_ids is None: 228 | return pd.Series(g[0], index=self.fam['iid'], name=variant_id) 229 | else: 230 | return pd.Series(g[0], index=sample_ids, name=variant_id) 231 | 232 | def load_genotypes(self): 233 | """Load all genotypes into memory, as pd.DataFrame""" 234 | return pd.DataFrame(self.bed.compute(), index=self.bim['snp'], columns=self.fam['iid']) 235 | 236 | 237 | def load_genotypes(genotype_path, select_samples=None, dosages=False): 238 | """Load all genotypes into a dataframe""" 239 | if all([os.path.exists(f"{genotype_path}.{ext}") for ext in ['pgen', 'psam', 'pvar']]): 240 | if pgen is None: 241 | raise ImportError('Pgenlib must be installed to use PLINK 2 pgen/psam/pvar files.') 242 | pgr = pgen.PgenReader(genotype_path, select_samples=select_samples) 243 | variant_df = pgr.pvar_df.set_index('id')[['chrom', 'pos']] 244 | if dosages: 245 | genotype_df = pgr.load_dosages() 246 | else: 247 | genotype_df = pgr.load_genotypes() 248 | elif all([os.path.exists(f"{genotype_path}.{ext}") for ext in ['bed', 'bim', 'fam']]): 249 | pr = PlinkReader(genotype_path, select_samples=select_samples, dtype=np.int8) 250 | genotype_df = pr.load_genotypes() 251 | variant_df = pr.bim.set_index('snp')[['chrom', 'pos']] 252 | elif genotype_path.endswith(('.bed.parquet', '.bed.gz', '.bed')): 253 | genotype_df, variant_df = read_phenotype_bed(genotype_path) 254 | assert variant_df.columns[1] == 'pos', "The BED file must define a single position for each variant, with start + 1 == end." 255 | variant_df.columns = ['chrom', 'pos'] 256 | elif genotype_path.endswith('.parquet'): 257 | genotype_df = pd.read_parquet(genotype_path) 258 | variant_df = None 259 | elif genotype_path.endswith('.gz'): 260 | with gzip.open(genotype_path, 'rt') as f: 261 | header = f.readline().strip().split('\t') 262 | dtypes = {i:np.float32 for i in header} 263 | dtypes[header[0]] = str 264 | genotype_df = pd.read_csv(genotype_path, sep='\t', index_col=0, dtype=dtypes) 265 | variant_df = None 266 | else: 267 | raise ValueError(f"Failed to load genotypes from {genotype_path}. 
Supported formats: pgen/psam/pvar, bed/bim/fam, parquet, tsv.gz") 268 | return genotype_df, variant_df 269 | 270 | 271 | def get_vcf_region(region_str, vcfpath, field='GT', sample_ids=None, select_samples=None, impute_missing=True): 272 | """Load VCF region (str: 'chr:start-end') as DataFrame (requires tabix)""" 273 | s = subprocess.check_output(f'tabix {vcfpath} {region_str}', shell=True) 274 | s = s.decode().strip().split('\n') 275 | s = [i.split('\t') for i in s] 276 | 277 | if sample_ids is None: 278 | sample_ids = get_sample_ids(vcfpath) 279 | variant_ids = [i[2] for i in s] 280 | pos_s = pd.Series([int(i[1]) for i in s], index=variant_ids) 281 | 282 | ix = _get_field_ix(s[0], field) 283 | g = np.array([parse_genotypes([i.split(':')[ix] for i in line[9:]], field=field) for line in s]) 284 | df = pd.DataFrame(g, index=variant_ids, columns=sample_ids) 285 | 286 | if select_samples is not None: 287 | df = df[select_samples] 288 | 289 | if impute_missing: 290 | n = 0 291 | for v in df.values: 292 | m = np.isnan(v) 293 | if np.any(m): 294 | v[m] = np.mean(v[~m]) 295 | n += 1 296 | if n > 0: 297 | print(f' imputed at least 1 sample in {n} sites') 298 | 299 | return df, pos_s 300 | 301 | 302 | def get_vcf_variants(variant_ids, vcfpath, field='GT', sample_ids=None): 303 | """Load a set of variants in VCF as DataFrame (requires tabix)""" 304 | variant_id_set = set(variant_ids) 305 | with tempfile.NamedTemporaryFile() as regions_file: 306 | df = pd.DataFrame([i.split('_')[:2] for i in variant_id_set], columns=['chr', 'pos']) 307 | df['pos'] = df['pos'].astype(int) 308 | df = df.sort_values(['chr', 'pos']) 309 | df.to_csv(regions_file.name, sep='\t', index=False, header=False) 310 | s = subprocess.check_output(f'tabix {vcfpath} --regions {regions_file.name}', shell=True) 311 | s = s.decode().strip().split('\n') 312 | s = [i.split('\t') for i in s] 313 | 314 | if sample_ids is None: 315 | sample_ids = get_sample_ids(vcfpath) 316 | 317 | ix = _get_field_ix(s[0], field) 318 | g = np.array([parse_genotypes([i.split(':')[ix] for i in line[9:]], field=field) for line in s]) 319 | g = np.array([i for i in g if -1 not in i]) # filter missing here instead of ValueError? 
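# tabix can return extra records that overlap the queried positions, so the
# block below subsets the results back to the requested variant IDs, keeping
# the order in which tabix returned them.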
320 | 321 | returned_variant_ids = [i[2] for i in s] 322 | ix = [k for k,i in enumerate(returned_variant_ids) if i in variant_id_set] 323 | g = np.array([g[i] for i in ix]) 324 | returned_variant_ids = [returned_variant_ids[i] for i in ix] 325 | return pd.DataFrame(g.astype(np.float32), index=returned_variant_ids, columns=sample_ids) 326 | 327 | #------------------------------------------------------------------------------ 328 | # Generator classes for batch processing of genotypes/phenotypes 329 | #------------------------------------------------------------------------------ 330 | class GenotypeGeneratorTrans(object): 331 | def __init__(self, genotype_df, batch_size=50000, chr_s=None): 332 | """ 333 | Generator for iterating over all variants (trans-scan) 334 | 335 | Inputs: 336 | genotype_df: Dataframe with genotypes (variants x samples) 337 | batch_size: Batch size for GPU processing 338 | 339 | Generates: genotype array (2D), variant ID array 340 | """ 341 | self.genotype_df = genotype_df 342 | self.batch_size = batch_size 343 | self.num_batches = int(np.ceil(self.genotype_df.shape[0] / batch_size)) 344 | self.batch_indexes = [[i*batch_size, (i+1)*batch_size] for i in range(self.num_batches)] 345 | self.batch_indexes[-1][1] = self.genotype_df.shape[0] 346 | if chr_s is not None: 347 | chroms, chr_ix = np.unique(chr_s, return_index=True) 348 | s = np.argsort(chr_ix) 349 | self.chroms = chroms[s] 350 | chr_ix = list(chr_ix[s]) + [chr_s.shape[0]] 351 | size_s = pd.Series(np.diff(chr_ix), index=self.chroms) 352 | self.chr_batch_indexes = {} 353 | for k,c in enumerate(self.chroms): 354 | num_batches = int(np.ceil(size_s[c] / batch_size)) 355 | batch_indexes = [[chr_ix[k]+i*batch_size, chr_ix[k]+(i+1)*batch_size] for i in range(num_batches)] 356 | batch_indexes[-1][1] = chr_ix[k+1] 357 | self.chr_batch_indexes[c] = batch_indexes 358 | 359 | def __len__(self): 360 | return self.num_batches 361 | 362 | @background(max_prefetch=6) 363 | def generate_data(self, chrom=None, verbose=False, enum_start=1): 364 | """Generate batches from genotype data""" 365 | if chrom is None: 366 | batch_indexes = self.batch_indexes 367 | num_batches = self.num_batches 368 | else: 369 | batch_indexes = self.chr_batch_indexes[chrom] 370 | num_batches = np.sum([len(i) for i in self.chr_batch_indexes.values()]) 371 | 372 | for k,i in enumerate(batch_indexes, enum_start): # loop through batches 373 | if verbose: 374 | print_progress(k, num_batches, 'batch') 375 | g = self.genotype_df.values[i[0]:i[1]] 376 | ix = self.genotype_df.index[i[0]:i[1]] # variant IDs 377 | yield g, ix 378 | 379 | 380 | def get_cis_ranges(phenotype_pos_df, chr_variant_dfs, window, verbose=True): 381 | """ 382 | 383 | start, end indexes (inclusive) 384 | """ 385 | # check phenotypes & calculate genotype ranges 386 | # get genotype indexes corresponding to cis-window of each phenotype 387 | if 'pos' in phenotype_pos_df: 388 | phenotype_pos_df = phenotype_pos_df.rename(columns={'pos':'start'}) 389 | phenotype_pos_df['end'] = phenotype_pos_df['start'] 390 | phenotype_pos_dict = phenotype_pos_df.to_dict(orient='index') 391 | 392 | drop_ids = [] 393 | cis_ranges = {} 394 | n = len(phenotype_pos_df) 395 | for k, phenotype_id in enumerate(phenotype_pos_df.index, 1): 396 | if verbose and (k % 1000 == 0 or k == n): 397 | print(f'\r * checking phenotypes: {k}/{n}', end='' if k != n else None) 398 | 399 | pos = phenotype_pos_dict[phenotype_id] 400 | chrom = pos['chr'] 401 | m = len(chr_variant_dfs[chrom]['pos'].values) 402 | lb = 
bisect.bisect_left(chr_variant_dfs[chrom]['pos'].values, pos['start'] - window) 403 | ub = bisect.bisect_right(chr_variant_dfs[chrom]['pos'].values, pos['end'] + window) 404 | if lb != ub: 405 | r = chr_variant_dfs[chrom]['index'].values[[lb, ub - 1]] 406 | else: 407 | r = [] 408 | 409 | if len(r) > 0: 410 | cis_ranges[phenotype_id] = r 411 | else: 412 | drop_ids.append(phenotype_id) 413 | 414 | return cis_ranges, drop_ids 415 | 416 | 417 | class InputGeneratorCis(object): 418 | """ 419 | Input generator for cis-mapping 420 | 421 | Inputs: 422 | genotype_df: genotype DataFrame (genotypes x samples) 423 | variant_df: DataFrame mapping variant_id (index) to chrom, pos 424 | phenotype_df: phenotype DataFrame (phenotypes x samples) 425 | phenotype_pos_df: DataFrame defining position of each phenotype, with columns ['chr', 'pos'] or ['chr', 'start', 'end'] 426 | window: cis-window; selects variants within +- cis-window from 'pos' (e.g., TSS for gene-based features) 427 | or within [start-window, end+window] if 'start' and 'end' are present in phenotype_pos_df 428 | 429 | Generates: phenotype array, genotype array (2D), cis-window indices, phenotype ID 430 | """ 431 | def __init__(self, genotype_df, variant_df, phenotype_df, phenotype_pos_df, group_s=None, window=1000000): 432 | assert (genotype_df.index == variant_df.index).all() 433 | assert (phenotype_df.index == phenotype_df.index.unique()).all() 434 | self.genotype_df = genotype_df 435 | self.variant_df = variant_df.copy() 436 | self.variant_df['index'] = np.arange(variant_df.shape[0]) 437 | self.n_samples = phenotype_df.shape[1] 438 | 439 | # drop phenotypes without genotypes on same contig 440 | variant_chrs = variant_df['chrom'].unique() 441 | phenotype_chrs = phenotype_pos_df['chr'].unique() 442 | self.chrs = [i for i in phenotype_chrs if i in variant_chrs] 443 | m = phenotype_pos_df['chr'].isin(self.chrs) 444 | if any(~m): 445 | print(f' ** dropping {sum(~m)} phenotypes on chrs. 
without genotypes') 446 | self.phenotype_df = phenotype_df[m] 447 | self.phenotype_pos_df = phenotype_pos_df[m] 448 | 449 | # check for constant phenotypes and drop 450 | m = np.all(self.phenotype_df.values == self.phenotype_df.values[:,[0]], 1) 451 | if m.any(): 452 | print(f' ** dropping {np.sum(m)} constant phenotypes') 453 | self.phenotype_df = self.phenotype_df.loc[~m] 454 | self.phenotype_pos_df = self.phenotype_pos_df.loc[~m] 455 | 456 | if len(self.phenotype_df) == 0: 457 | raise ValueError("No phenotypes remain after filters.") 458 | 459 | self.group_s = None 460 | self.window = window 461 | 462 | self.chr_variant_dfs = {c:g[['pos', 'index']] for c,g in self.variant_df.groupby('chrom')} 463 | 464 | # check phenotypes & calculate genotype ranges 465 | # get genotype indexes corresponding to cis-window of each phenotype 466 | self.cis_ranges, drop_ids = get_cis_ranges(self.phenotype_pos_df, self.chr_variant_dfs, self.window) 467 | if len(drop_ids) > 0: 468 | print(f" ** dropping {len(drop_ids)} phenotypes without variants in cis-window") 469 | self.phenotype_df = self.phenotype_df.drop(drop_ids) 470 | self.phenotype_pos_df = self.phenotype_pos_df.drop(drop_ids) 471 | if 'pos' in self.phenotype_pos_df: 472 | self.phenotype_start = self.phenotype_pos_df['pos'].to_dict() 473 | self.phenotype_end = self.phenotype_start 474 | else: 475 | self.phenotype_start = self.phenotype_pos_df['start'].to_dict() 476 | self.phenotype_end = self.phenotype_pos_df['end'].to_dict() 477 | self.n_phenotypes = self.phenotype_df.shape[0] 478 | 479 | if group_s is not None: 480 | self.group_s = group_s.loc[self.phenotype_df.index].copy() 481 | self.n_groups = self.group_s.unique().shape[0] 482 | 483 | 484 | @background(max_prefetch=6) 485 | def generate_data(self, chrom=None, verbose=False): 486 | """ 487 | Generate batches from genotype data 488 | 489 | Returns: phenotype array, genotype matrix, genotype index, phenotype ID(s), [group ID] 490 | """ 491 | if chrom is None: 492 | phenotype_ids = self.phenotype_df.index 493 | chr_offset = 0 494 | else: 495 | phenotype_ids = self.phenotype_pos_df[self.phenotype_pos_df['chr'] == chrom].index 496 | if self.group_s is None: 497 | offset_dict = {i:j for i,j in zip(*np.unique(self.phenotype_pos_df['chr'], return_index=True))} 498 | else: 499 | offset_dict = {i:j for i,j in zip(*np.unique(self.phenotype_pos_df['chr'][self.group_s.drop_duplicates().index], return_index=True))} 500 | chr_offset = offset_dict[chrom] 501 | 502 | index_dict = {j:i for i,j in enumerate(self.phenotype_df.index)} 503 | 504 | if self.group_s is None: 505 | for k,phenotype_id in enumerate(phenotype_ids, chr_offset+1): 506 | if verbose: 507 | print_progress(k, self.n_phenotypes, 'phenotype') 508 | p = self.phenotype_df.values[index_dict[phenotype_id]] 509 | # p = self.phenotype_df.values[k] 510 | r = self.cis_ranges[phenotype_id] 511 | yield p, self.genotype_df.values[r[0]:r[-1]+1], np.arange(r[0],r[-1]+1), phenotype_id 512 | else: 513 | gdf = self.group_s[phenotype_ids].groupby(self.group_s, sort=False) 514 | for k,(group_id,g) in enumerate(gdf, chr_offset+1): 515 | if verbose: 516 | print_progress(k, self.n_groups, 'phenotype group') 517 | # check that ranges are the same for all phenotypes within group 518 | assert np.all([self.cis_ranges[g.index[0]][0] == self.cis_ranges[i][0] and self.cis_ranges[g.index[0]][1] == self.cis_ranges[i][1] for i in g.index[1:]]) 519 | group_phenotype_ids = g.index.tolist() 520 | # p = self.phenotype_df.loc[group_phenotype_ids].values 521 | p = 
self.phenotype_df.values[[index_dict[i] for i in group_phenotype_ids]] 522 | r = self.cis_ranges[g.index[0]] 523 | yield p, self.genotype_df.values[r[0]:r[-1]+1], np.arange(r[0],r[-1]+1), group_phenotype_ids, group_id 524 | 525 | 526 | def get_chunk_size(memory_gb, samples): 527 | """""" 528 | return memory_gb * 1024**3 // samples 529 | 530 | 531 | def generate_paired_chunks(pgr, phenotype_df, phenotype_pos_df, chunk_size, window=1000000, 532 | dosages=False, verbose=True): 533 | """ 534 | Generate paired genotype-phenotype chunks for large datasets where only a subset of 535 | genotypes can be loaded into memory. 536 | 537 | pgr: pgen.PgenReader 538 | phenotype_df: phenotype DataFrame (phenotypes x samples) 539 | phenotype_pos_df: DataFrame defining position of each phenotype, with columns ['chr', 'pos'] or ['chr', 'start', 'end'] 540 | chunk_size: maximum number of variants to load into CPU memory 541 | window: cis-window 542 | dosages: load dosages (DS) from genotype files (default: GT) 543 | """ 544 | variant_df = pgr.pvar_df.set_index('id')[['chrom', 'pos']] 545 | cis_ranges, _ = get_cis_ranges(phenotype_pos_df, pgr.variant_dfs, window) 546 | range_df = pd.DataFrame(cis_ranges, index=['start', 'end']).T 547 | range_df = range_df.join(phenotype_pos_df['chr']) 548 | 549 | if chunk_size == 'chr': 550 | chrlen_s = range_df['chr'].value_counts(sort=False) 551 | start_ixs = [0] + chrlen_s.cumsum().tolist() 552 | else: 553 | chunk_size = int(chunk_size) 554 | # check chunk size 555 | max_cis_var = (range_df['end'] - range_df['start'] + 1).max() 556 | if not max_cis_var <= chunk_size: 557 | raise ValueError(f"Max. chunk size must be at least largest cis-window ({max_cis_var})") 558 | 559 | start_ixs = [0] 560 | while start_ixs[-1] < range_df.shape[0]: 561 | end_ix = bisect.bisect_left(range_df['end'].values, range_df['start'].values[start_ixs[-1]] + chunk_size) 562 | start_ixs.append(end_ix) 563 | start_ixs[-1] = range_df.shape[0] 564 | 565 | nchunks = len(start_ixs) - 1 566 | for ci in range(nchunks): 567 | if verbose: 568 | print(f"Processing genotype-phenotype chunk {ci+1}/{nchunks}") 569 | ix = slice(start_ixs[ci], start_ixs[ci+1]) 570 | chunk_df = range_df[ix] 571 | if chunk_size == 'chr': 572 | assert (chunk_df['chr'] == chrlen_s.index[ci]).all() 573 | if dosages: 574 | gt_df = pgr.read_dosages_range(chunk_df['start'].values[0], chunk_df['end'].values[-1], dtype=np.float32) 575 | else: 576 | gt_df = pgr.read_range(chunk_df['start'].values[0], chunk_df['end'].values[-1], impute_mean=False, dtype=np.int8) 577 | var_df = variant_df.iloc[chunk_df['start'].values[0]:chunk_df['end'].values[-1]+1] 578 | yield gt_df, var_df, phenotype_df[ix], phenotype_pos_df[ix], ci 579 | -------------------------------------------------------------------------------- /tensorqtl/mixqtl.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import os 4 | import sys 5 | sys.path.insert(1, os.path.dirname(__file__)) 6 | import cis 7 | from core import * 8 | 9 | 10 | def trc(genotypes_t, counts_t, covariates_t=None, select_covariates=True, 11 | count_threshold=0, imputation='offset', mode='standard', return_af=False): 12 | """ 13 | Inputs 14 | genotypes_t: dosages (variants x samples) 15 | counts_t: DESeq size factor-normalized read counts 16 | covariates_t: covariates matrix, first column must be intercept 17 | mode: if 'standard', parallel regression for each variant in genotypes_t 18 | if 'multi', multiple regression for all 
variants in genotypes_t 19 | 20 | Outputs: 21 | t-statistic, beta, beta_se {af, ma_samples, ma_counts} (mode='standard') 22 | beta, beta_se (mode='multi') 23 | """ 24 | nonzero_t = counts_t != 0 25 | 26 | if imputation == 'offset': 27 | log_counts_t = counts_t.log1p() 28 | elif imputation == 'half_min': 29 | log_counts_t = counts_t.clone() 30 | log_counts_t[~nonzero_t] = log_counts_t[nonzero_t].min() / 2 31 | log_counts_t = log_counts_t.log() 32 | 33 | if covariates_t is not None: 34 | if select_covariates: 35 | # select significant covariates 36 | b_t, b_se_t = linreg(covariates_t[nonzero_t, :], log_counts_t[nonzero_t], dtype=torch.float32) 37 | tstat_t = b_t / b_se_t 38 | m = tstat_t.abs() > 2 39 | m[0] = True # keep intercept 40 | sel_covariates_t = covariates_t[:, m] 41 | else: 42 | sel_covariates_t = covariates_t 43 | 44 | # Regress out covariates from non-zero counts, and keep zeros. 45 | # This follows the original mixQTL implementation, but may be 46 | # problematic when count_threshold is 0. 47 | residualizer = Residualizer(sel_covariates_t[nonzero_t, 1:]) # exclude intercept 48 | y_t = counts_t.clone() 49 | y_t[nonzero_t] = residualizer.transform(log_counts_t[nonzero_t].reshape(1,-1), center=True) 50 | else: 51 | y_t = log_counts_t 52 | 53 | m_t = counts_t >= count_threshold 54 | 55 | if mode == 'standard': 56 | res = cis.calculate_cis_nominal(genotypes_t[:, m_t] / 2, y_t[m_t], return_af=False) 57 | if return_af: 58 | af, ma_samples, ma_counts = get_allele_stats(genotypes_t) 59 | return *res, af, ma_samples, ma_counts 60 | else: 61 | return res 62 | 63 | elif mode.startswith('multi'): 64 | X_t = torch.cat([torch.ones([m_t.sum(), 1], dtype=bool).to(genotypes_t.device), genotypes_t[:, m_t].T / 2], axis=1) 65 | b_t, b_se_t = linreg(X_t, y_t[m_t], dtype=torch.float32) 66 | return b_t[1:], b_se_t[1:] 67 | -------------------------------------------------------------------------------- /tensorqtl/pgen.py: -------------------------------------------------------------------------------- 1 | # Functions for reading dosages from PLINK pgen files based on the Pgenlib Python API: 2 | # https://github.com/chrchang/plink-ng/blob/master/2.0/Python/python_api.txt 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pgenlib as pg 7 | import os 8 | import bisect 9 | 10 | 11 | def read_pvar(pvar_path): 12 | """Read pvar file as pd.DataFrame""" 13 | return pd.read_csv(pvar_path, sep='\t', comment='#', 14 | names=['chrom', 'pos', 'id', 'ref', 'alt', 'qual', 'filter', 'info'], 15 | dtype={'chrom':str, 'pos':np.int32, 'id':str, 'ref':str, 'alt':str, 16 | 'qual':str, 'filter':str, 'info':str}) 17 | 18 | 19 | def read_psam(psam_path): 20 | """Read psam file as pd.DataFrame""" 21 | psam_df = pd.read_csv(psam_path, sep='\t', index_col=0) 22 | psam_df.index = psam_df.index.astype(str) 23 | return psam_df 24 | 25 | 26 | def hardcall_phase_present(pgen_path): 27 | """Returns True iff phased hardcalls may be present""" 28 | with pg.PgenReader(pgen_path.encode()) as r: 29 | return r.hardcall_phase_present() 30 | 31 | 32 | def get_reader(pgen_path, sample_subset=None): 33 | """""" 34 | if sample_subset is not None: 35 | sample_subset = np.array(sample_subset, dtype=np.uint32) 36 | reader = pg.PgenReader(pgen_path.encode(), sample_subset=sample_subset) 37 | if sample_subset is None: 38 | num_samples = reader.get_raw_sample_ct() 39 | else: 40 | num_samples = len(sample_subset) 41 | return reader, num_samples 42 | 43 | 44 | def read(pgen_path, variant_idx, sample_subset=None, dtype=np.int8): 45 | """ 46 | 
Get genotypes for a variant.
47 | 
48 | Parameters
49 | ----------
50 | pgen_path : str
51 | Path of PLINK 2 pgen file
52 | variant_idx : int
53 | Variant index
54 | sample_subset : array_like
55 | List of sample indexes to select. Must be sorted.
56 | dtype : np.int{8,32,64}
57 | Data type of the returned array.
58 | 
59 | Returns
60 | -------
61 | genotypes : ndarray
62 | Genotypes (as {0, 1, 2, -9}) for the selected variant and samples.
63 | """
64 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset)
65 | genotypes = np.zeros(num_samples, dtype=dtype)
66 | with reader as r:
67 | r.read(np.array(variant_idx, dtype=np.uint32), genotypes)
68 | return genotypes
69 | 
70 | 
71 | def read_dosages(pgen_path, variant_idx, sample_subset=None, dtype=np.float32):
72 | """
73 | Get dosages for a variant.
74 | 
75 | Parameters
76 | ----------
77 | pgen_path : str
78 | Path of PLINK 2 pgen file
79 | variant_idx : int
80 | Variant index
81 | sample_subset : array_like
82 | List of sample indexes to select. Must be sorted.
83 | dtype : np.float{32,64}
84 | Data type of the returned array.
85 | 
86 | Returns
87 | -------
88 | dosages : ndarray
89 | Genotype dosages for the selected variant and samples.
90 | """
91 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset)
92 | dosages = np.zeros(num_samples, dtype=dtype)
93 | with reader as r:
94 | r.read_dosages(np.array(variant_idx, dtype=np.uint32), dosages)
95 | return dosages
96 | 
97 | 
98 | def read_alleles(pgen_path, variant_idx, sample_subset=None):
99 | """
100 | Get alleles for a variant.
101 | 
102 | Parameters
103 | ----------
104 | pgen_path : str
105 | Path of PLINK 2 pgen file
106 | variant_idx : int
107 | Variant index
108 | sample_subset : array_like
109 | List of sample indexes to select. Must be sorted.
110 | 
111 | Returns
112 | -------
113 | alleles : ndarray (2 * sample_ct)
114 | Alleles for the selected variant and samples.
115 | Elements 2n and 2n+1 correspond to sample n.
116 | Both elements are -9 for missing genotypes.
117 | If the genotype is unphased, the lower index appears first.
118 | """
119 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset)
120 | alleles = np.zeros(2*num_samples, dtype=np.int32)
121 | with reader as r:
122 | r.read_alleles(np.array(variant_idx, dtype=np.uint32), alleles)
123 | return alleles
124 | 
125 | 
126 | def read_list(pgen_path, variant_idxs, sample_subset=None, dtype=np.int8):
127 | """
128 | Get genotypes for a list of variants.
129 | 
130 | Parameters
131 | ----------
132 | pgen_path : str
133 | Path of PLINK 2 pgen file
134 | variant_idxs : array_like
135 | List of variant indexes
136 | sample_subset : array_like
137 | List of sample indexes to select. Must be sorted.
138 | dtype : np.int{8,32,64}
139 | Data type of the returned array.
140 | 
141 | Returns
142 | -------
143 | genotypes : ndarray
144 | Genotypes for the selected variants and samples.
145 | """
146 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset)
147 | num_variants = len(variant_idxs)
148 | genotypes = np.zeros([num_variants, num_samples], dtype=dtype)
149 | with reader as r:
150 | r.read_list(np.array(variant_idxs, dtype=np.uint32), genotypes)
151 | return genotypes
152 | 
153 | 
154 | def read_dosages_list(pgen_path, variant_idxs, sample_subset=None, dtype=np.float32):
155 | """
156 | Get dosages for a list of variants. 
157 | 158 | Parameters 159 | ---------- 160 | pgen_path : str 161 | Path of PLINK 2 pgen file 162 | variant_idxs : array_like 163 | List of variant indexes 164 | sample_subset : array_like 165 | List of sample indexes to select. Must be sorted. 166 | dtype : np.float{32,64} 167 | Data type of the returned array. 168 | 169 | Returns 170 | ------- 171 | dosages : ndarray 172 | Genotype dosages for the selected variants and samples. 173 | """ 174 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset) 175 | num_variants = len(variant_idxs) 176 | dosages = np.zeros([num_variants, num_samples], dtype=dtype) 177 | with reader as r: 178 | r.read_dosages_list(np.array(variant_idxs, dtype=np.uint32), dosages) 179 | return dosages 180 | 181 | 182 | def read_alleles_list(pgen_path, variant_idxs, sample_subset=None): 183 | """ 184 | Get alleles for a list of variants. 185 | 186 | Parameters 187 | ---------- 188 | pgen_path : str 189 | Path of PLINK 2 pgen file 190 | variant_idxs : array_like 191 | List of variant indexes 192 | sample_subset : array_like 193 | List of sample indexes to select. Must be sorted. 194 | 195 | Returns 196 | ------- 197 | alleles : ndarray 198 | Alleles for the selected variants and samples. 199 | """ 200 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset) 201 | num_variants = len(variant_idxs) 202 | alleles = np.zeros([num_variants, 2*num_samples], dtype=np.int32) 203 | with reader as r: 204 | r.read_alleles_list(np.array(variant_idxs, dtype=np.uint32), alleles) 205 | return alleles 206 | 207 | 208 | def read_range(pgen_path, start_idx, end_idx, sample_subset=None, dtype=np.int8): 209 | """ 210 | Get genotypes for a range of variants. 211 | 212 | Parameters 213 | ---------- 214 | pgen_path : str 215 | Path of PLINK 2 pgen file 216 | start_idx : int 217 | Start index of the range to query. 218 | end_idx : int 219 | End index of the range to query (inclusive). 220 | sample_subset : array_like 221 | List of sample indexes to select. Must be sorted. 222 | dtype : np.int{8,32,64} 223 | Data type of the returned array. 224 | 225 | Returns 226 | ------- 227 | dosages : ndarray 228 | Genotypes for the selected variants and samples. 229 | """ 230 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset) 231 | num_variants = end_idx - start_idx + 1 232 | genotypes = np.zeros([num_variants, num_samples], dtype=dtype) 233 | with reader as r: 234 | r.read_range(start_idx, end_idx+1, genotypes) 235 | return genotypes 236 | 237 | 238 | def read_dosages_range(pgen_path, start_idx, end_idx, sample_subset=None, dtype=np.float32): 239 | """ 240 | Get dosages for a range of variants. 241 | 242 | Parameters 243 | ---------- 244 | pgen_path : str 245 | Path of PLINK 2 pgen file 246 | start_idx : int 247 | Start index of the range to query. 248 | end_idx : int 249 | End index of the range to query (inclusive). 250 | sample_subset : array_like 251 | List of sample indexes to select. Must be sorted. 252 | dtype : np.float{32,64} 253 | Data type of the returned array. 254 | 255 | Returns 256 | ------- 257 | dosages : ndarray 258 | Genotype dosages for the selected variants and samples. 
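
    Examples
    --------
    A sketch (the pgen path is a placeholder); end_idx is inclusive, so this
    reads the first ten variants:

    >>> dosages = read_dosages_range('data/chr18.pgen', 0, 9)
    >>> dosages.shape   # (10, num_samples)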
259 | """ 260 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset) 261 | num_variants = end_idx - start_idx + 1 262 | dosages = np.zeros([num_variants, num_samples], dtype=dtype) 263 | with reader as r: 264 | r.read_dosages_range(start_idx, end_idx+1, dosages) 265 | return dosages 266 | 267 | 268 | def read_alleles_range(pgen_path, start_idx, end_idx, sample_subset=None): 269 | """ 270 | Get alleles for a range of variants. 271 | 272 | Parameters 273 | ---------- 274 | pgen_path : str 275 | Path of PLINK 2 pgen file 276 | start_idx : int 277 | Start index of the range to query. 278 | end_idx : int 279 | End index of the range to query (inclusive). 280 | sample_subset : array_like 281 | List of sample indexes to select. Must be sorted. 282 | 283 | Returns 284 | ------- 285 | alleles : ndarray 286 | Alleles for the selected variants and samples. 287 | """ 288 | reader, num_samples = get_reader(pgen_path, sample_subset=sample_subset) 289 | num_variants = end_idx - start_idx + 1 290 | alleles = np.zeros([num_variants, 2*num_samples], dtype=np.int32) 291 | with reader as r: 292 | r.read_alleles_range(start_idx, end_idx+1, alleles) 293 | return alleles 294 | 295 | 296 | def _impute_mean(genotypes): 297 | """Impute missing genotypes to mean""" 298 | m = genotypes == -9 299 | if genotypes.ndim == 1 and any(m): 300 | genotypes[m] = genotypes[~m].mean() 301 | else: # genotypes.ndim == 2 302 | ix = np.nonzero(m)[0] 303 | if len(ix) > 0: 304 | a = genotypes.sum(1) 305 | b = m.sum(1) 306 | mu = (a + 9*b) / (genotypes.shape[1] - b) 307 | genotypes[m] = mu[ix] 308 | 309 | 310 | class PgenReader(object): 311 | """ 312 | Class for reading genotype data from PLINK 2 pgen files 313 | 314 | To generate the pgen/psam/pvar files from a VCF, run 315 | plink2 --vcf ${vcf_file} --output-chr chrM --out ${plink_prefix_path} 316 | To use dosages, run: 317 | plink2 --vcf ${vcf_file} 'dosage=DS' --output-chr chrM --out ${plink_prefix_path} 318 | 319 | Requires pgenlib: https://github.com/chrchang/plink-ng/tree/master/2.0/Python 320 | """ 321 | def __init__(self, plink_prefix_path, select_samples=None): 322 | """ 323 | plink_prefix_path: prefix to PLINK pgen,psam,pvar files 324 | select_samples: specify a subset of samples 325 | """ 326 | 327 | if os.path.exists(f"{plink_prefix_path}.pvar.parquet"): 328 | self.pvar_df = pd.read_parquet(f"{plink_prefix_path}.pvar.parquet") 329 | else: 330 | self.pvar_df = read_pvar(f"{plink_prefix_path}.pvar") 331 | self.psam_df = read_psam(f"{plink_prefix_path}.psam") 332 | self.pgen_file = f"{plink_prefix_path}.pgen" 333 | 334 | self.num_variants = self.pvar_df.shape[0] 335 | self.variant_ids = self.pvar_df['id'].tolist() 336 | self.variant_idx_dict = {i:k for k,i in enumerate(self.variant_ids)} 337 | 338 | self.sample_id_list = self.psam_df.index.tolist() 339 | self.set_samples(select_samples) 340 | 341 | variant_df = self.pvar_df.set_index('id')[['chrom', 'pos']] 342 | variant_df['index'] = np.arange(variant_df.shape[0]) 343 | self.variant_df = variant_df 344 | self.variant_dfs = {c:g[['pos', 'index']] for c,g in variant_df.groupby('chrom', sort=False)} 345 | 346 | def set_samples(self, sample_ids=None, sort=True): 347 | """ 348 | Set samples to load. 349 | 350 | Parameters 351 | ---------- 352 | sample_ids : array_like 353 | List of samples to select. 354 | sort : bool 355 | Preserve sample order from pgen file. 
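
    Notes
    -----
    Sample indexes passed to the reader must be in increasing order, so with
    sort=True the requested IDs are stored in psam-file order. A sketch (the
    prefix and sample IDs are placeholders):

    >>> pgr = PgenReader('data/GEUVADIS.chr18')
    >>> pgr.set_samples(['HG00276', 'HG00096'])  # reordered to psam order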
356 |         """
357 |         if sample_ids is None:
358 |             self.sample_ids = self.sample_id_list
359 |             self.sample_idxs = None
360 |         else:
361 |             sample_idxs = [self.sample_id_list.index(i) for i in sample_ids]
362 |             if sort:
363 |                 sidx = np.argsort(sample_idxs)
364 |                 sample_idxs = [sample_idxs[i] for i in sidx]
365 |                 sample_ids = [sample_ids[i] for i in sidx]
366 |             self.sample_ids = sample_ids
367 |             self.sample_idxs = sample_idxs
368 | 
369 |     def get_range(self, region, start=None, end=None):
370 |         """
371 |         Get variant indexes corresponding to region specified as 'chr:start-end', or as chr, start, end.
372 | 
373 |         Parameters
374 |         ----------
375 |         region : str
376 |             Genomic region, defined as 'chr:start-end' (1-based, inclusive), or chromosome.
377 |         start : int
378 |             Start position of the genomic interval (if chromosome is provided in first argument).
379 |         end : int
380 |             End position of the genomic interval (if chromosome is provided in first argument).
381 | 
382 |         Returns
383 |         -------
384 |         indexes : ndarray
385 |             [start, end] indexes (inclusive)
386 |         """
387 |         if start is None and end is None:
388 |             if ':' in region:
389 |                 chrom, pos = region.split(':')
390 |                 start, end = [int(i) for i in pos.split('-')]
391 |             else:  # full chromosome selected
392 |                 chrom = region
393 |                 return self.variant_dfs[chrom]['index'].values[[0, -1]]
394 |         else:  # input is chr, start, end
395 |             chrom = region
396 | 
397 |         lb = bisect.bisect_left(self.variant_dfs[chrom]['pos'].values, start)
398 |         ub = bisect.bisect_right(self.variant_dfs[chrom]['pos'].values, end)
399 |         if lb != ub:
400 |             r = self.variant_dfs[chrom]['index'].values[[lb, ub - 1]]
401 |         else:
402 |             r = []
403 |         return r
404 | 
405 |     def read(self, variant_id, impute_mean=True, dtype=np.float32):
406 |         """Read genotypes for an individual variant as 0,1,2,-9; impute missing values (-9) to mean (default)."""
407 |         variant_idx = self.variant_idx_dict[variant_id]
408 |         genotypes = read(self.pgen_file, variant_idx, sample_subset=self.sample_idxs,
409 |                          dtype=np.int8).astype(dtype)
410 |         if impute_mean:
411 |             _impute_mean(genotypes)
412 |         return pd.Series(genotypes, index=self.sample_ids, name=variant_id)
413 | 
414 |     def read_list(self, variant_ids, impute_mean=True, dtype=np.float32):
415 |         """Read genotypes for a list of variants as 0,1,2,-9; impute missing values (-9) to mean (default)."""
416 |         variant_idxs = [self.variant_idx_dict[i] for i in variant_ids]
417 |         genotypes = read_list(self.pgen_file, variant_idxs, sample_subset=self.sample_idxs,
418 |                               dtype=np.int8).astype(dtype)
419 |         if impute_mean:
420 |             _impute_mean(genotypes)
421 |         return pd.DataFrame(genotypes, index=variant_ids, columns=self.sample_ids)
422 | 
423 |     def read_range(self, start_idx, end_idx, impute_mean=True, dtype=np.float32):
424 |         """Read genotypes for a range of variants as 0,1,2,-9; impute missing values (-9) to mean (default)."""
425 |         genotypes = read_range(self.pgen_file, start_idx, end_idx, sample_subset=self.sample_idxs,
426 |                                dtype=np.int8).astype(dtype)
427 |         if impute_mean:
428 |             _impute_mean(genotypes)
429 |         return pd.DataFrame(genotypes, index=self.variant_ids[start_idx:end_idx+1], columns=self.sample_ids)
430 | 
431 |     def read_region(self, region, start_pos=None, end_pos=None, impute_mean=True, dtype=np.float32):
432 |         """Read genotypes for variants in a genomic region as 0,1,2,-9; impute missing values (-9) to mean (default)."""
433 |         r = self.get_range(region, start_pos, end_pos)
434 |         if len(r) > 0:
435 |             return self.read_range(*r, impute_mean=impute_mean, dtype=dtype)
436 | 
437
| def read_dosages(self, variant_id, dtype=np.float32): 438 | variant_idx = self.variant_idx_dict[variant_id] 439 | dosages = read_dosages(self.pgen_file, variant_idx, sample_subset=self.sample_idxs, dtype=dtype) 440 | return pd.Series(dosages, index=self.sample_ids, name=variant_id) 441 | 442 | def read_dosages_list(self, variant_ids, dtype=np.float32): 443 | variant_idxs = [self.variant_idx_dict[i] for i in variant_ids] 444 | dosages = read_dosages_list(self.pgen_file, variant_idxs, sample_subset=self.sample_idxs, dtype=dtype) 445 | return pd.DataFrame(dosages, index=variant_ids, columns=self.sample_ids) 446 | 447 | def read_dosages_range(self, start_idx, end_idx, dtype=np.float32): 448 | dosages = read_dosages_range(self.pgen_file, start_idx, end_idx, sample_subset=self.sample_idxs, dtype=dtype) 449 | return pd.DataFrame(dosages, index=self.variant_ids[start_idx:end_idx+1], columns=self.sample_ids) 450 | 451 | def read_dosages_region(self, region, start_pos=None, end_pos=None, dtype=np.float32): 452 | r = self.get_range(region, start_pos, end_pos) 453 | if len(r) > 0: 454 | return self.read_dosages_range(*r, dtype=dtype) 455 | 456 | def read_alleles(self, variant_id): 457 | variant_idx = self.variant_idx_dict[variant_id] 458 | alleles = read_alleles(self.pgen_file, variant_idx, sample_subset=self.sample_idxs) 459 | s1 = pd.Series(alleles[::2], index=self.sample_ids, name=variant_id) 460 | s2 = pd.Series(alleles[1::2], index=self.sample_ids, name=variant_id) 461 | return s1, s2 462 | 463 | def read_alleles_list(self, variant_ids): 464 | variant_idxs = [self.variant_idx_dict[i] for i in variant_ids] 465 | alleles = read_alleles_list(self.pgen_file, variant_idxs, sample_subset=self.sample_idxs) 466 | df1 = pd.DataFrame(alleles[:,::2], index=variant_ids, columns=self.sample_ids) 467 | df2 = pd.DataFrame(alleles[:,1::2], index=variant_ids, columns=self.sample_ids) 468 | return df1, df2 469 | 470 | def read_alleles_range(self, start_idx, end_idx): 471 | alleles = read_alleles_range(self.pgen_file, start_idx, end_idx, sample_subset=self.sample_idxs) 472 | df1 = pd.DataFrame(alleles[:,::2], index=self.variant_ids[start_idx:end_idx+1], columns=self.sample_ids) 473 | df2 = pd.DataFrame(alleles[:,1::2], index=self.variant_ids[start_idx:end_idx+1], columns=self.sample_ids) 474 | return df1, df2 475 | 476 | def read_alleles_region(self, region, start_pos=None, end_pos=None): 477 | r = self.get_range(region, start_pos, end_pos) 478 | if len(r) > 0: 479 | return self.read_alleles_range(*r) 480 | else: 481 | return None, None 482 | 483 | def load_genotypes(self): 484 | """Load all genotypes as np.int8, without imputing missing values.""" 485 | genotypes = read_range(self.pgen_file, 0, self.num_variants-1, sample_subset=self.sample_idxs) 486 | return pd.DataFrame(genotypes, index=self.variant_ids, columns=self.sample_ids) 487 | 488 | def load_dosages(self): 489 | """Load all dosages.""" 490 | return self.read_dosages_range(0, self.num_variants-1) 491 | 492 | def load_alleles(self): 493 | """Load all alleles.""" 494 | return self.read_alleles_range(0, self.num_variants-1) 495 | 496 | def get_pairwise_ld(self, id1, id2, r2=True, dtype=np.float32): 497 | """Compute pairwise LD (R2) between (lists of) variants""" 498 | if isinstance(id1, str) and isinstance(id2, str): 499 | g1 = self.read(id1, dtype=dtype) 500 | g2 = self.read(id2, dtype=dtype) 501 | g1 -= g1.mean() 502 | g2 -= g2.mean() 503 | if r2: 504 | r = (g1 * g2).sum()**2 / ( (g1**2).sum() * (g2**2).sum() ) 505 | else: 506 | r = (g1 * g2).sum() / 
np.sqrt( (g1**2).sum() * (g2**2).sum() ) 507 | elif isinstance(id1, str): 508 | g1 = self.read(id1, dtype=dtype) 509 | g2 = self.read_list(id2, dtype=dtype) 510 | g1 -= g1.mean() 511 | g2 -= g2.values.mean(1, keepdims=True) 512 | if r2: 513 | r = (g1 * g2).sum(1)**2 / ( (g1**2).sum() * (g2**2).sum(1) ) 514 | else: 515 | r = (g1 * g2).sum(1) / np.sqrt( (g1**2).sum() * (g2**2).sum(1) ) 516 | elif isinstance(id2, str): 517 | g1 = self.read_list(id1, dtype=dtype) 518 | g2 = self.read(id2, dtype=dtype) 519 | g1 -= g1.values.mean(1, keepdims=True) 520 | g2 -= g2.mean() 521 | if r2: 522 | r = (g1 * g2).sum(1)**2 / ( (g1**2).sum(1) * (g2**2).sum() ) 523 | else: 524 | r = (g1 * g2).sum(1) / np.sqrt( (g1**2).sum(1) * (g2**2).sum() ) 525 | else: 526 | assert len(id1) == len(id2) 527 | g1 = self.read_list(id1, dtype=dtype).values 528 | g2 = self.read_list(id2, dtype=dtype).values 529 | g1 -= g1.mean(1, keepdims=True) 530 | g2 -= g2.mean(1, keepdims=True) 531 | if r2: 532 | r = (g1 * g2).sum(1) ** 2 / ( (g1**2).sum(1) * (g2**2).sum(1) ) 533 | else: 534 | r = (g1 * g2).sum(1) / np.sqrt( (g1**2).sum(1) * (g2**2).sum(1) ) 535 | return r 536 | 537 | def get_ld_matrix(self, variant_ids, dtype=np.float32): 538 | g = self.read_list(variant_ids, dtype=dtype).values 539 | return pd.DataFrame(np.corrcoef(g), index=variant_ids, columns=variant_ids) 540 | 541 | 542 | def load_dosages_df(plink_prefix_path, select_samples=None): 543 | """ 544 | Load dosages for all variants and all/selected samples as a dataframe. 545 | 546 | Parameters 547 | ---------- 548 | plink_prefix_path : str 549 | Prefix to .pgen/.psam/.pvar files 550 | select_samples : array_like 551 | List of sample IDs to select. Default: all samples. 552 | 553 | Returns 554 | ------- 555 | dosages_df : pd.DataFrame (variants x samples) 556 | Genotype dosages for the selected samples. 
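
    Examples
    --------
    A sketch (the PLINK prefix and sample IDs are placeholders):

    >>> dosages_df = load_dosages_df('data/GEUVADIS.chr18',
    ...                              select_samples=['HG00096', 'HG00097'])
    >>> dosages_df.shape   # (num_variants, 2)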
557 |     """
558 |     p = PgenReader(plink_prefix_path, select_samples=select_samples)
559 |     return p.load_dosages()
560 | 
--------------------------------------------------------------------------------
/tensorqtl/post.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import torch
4 | import scipy.stats as stats
5 | import subprocess
6 | import sys
7 | import os
8 | import glob
9 | from datetime import datetime
10 | 
11 | sys.path.insert(1, os.path.dirname(__file__))
12 | from core import *
13 | import mixqtl
14 | import qtl.genotype as gt
15 | 
16 | 
17 | def calculate_qvalues(res_df, fdr=0.05, qvalue_lambda=None, logger=None):
18 |     """Annotate permutation results with q-values, p-value threshold"""
19 |     if logger is None:
20 |         logger = SimpleLogger()
21 | 
22 |     logger.write('Computing q-values')
23 |     logger.write(f' * Number of phenotypes tested: {res_df.shape[0]}')
24 | 
25 |     if not res_df['pval_beta'].isnull().all():
26 |         pval_col = 'pval_beta'
27 |         r = stats.pearsonr(res_df['pval_perm'], res_df['pval_beta'])[0]
28 |         logger.write(f' * Correlation between Beta-approximated and empirical p-values: {r:.4f}')
29 |     else:
30 |         pval_col = 'pval_perm'
31 |         logger.write(' * WARNING: no beta-approximated p-values found, using permutation p-values instead.')
32 | 
33 |     # calculate q-values
34 |     if qvalue_lambda is not None:
35 |         logger.write(f' * Calculating q-values with lambda = {qvalue_lambda:.3f}')
36 |     qval, pi0 = rfunc.qvalue(res_df[pval_col], lambda_qvalue=qvalue_lambda)
37 | 
38 |     res_df['qval'] = qval
39 |     logger.write(f' * Proportion of significant phenotypes (1-pi0): {1-pi0:.2f}')
40 |     logger.write(f" * QTL phenotypes @ FDR {fdr:.2f}: {(res_df['qval'] <= fdr).sum()}")
41 | 
42 |     # determine global min(p) significance threshold and calculate nominal p-value threshold for each gene
43 |     if pval_col == 'pval_beta':
44 |         lb = res_df.loc[res_df['qval'] <= fdr, 'pval_beta'].sort_values()
45 |         ub = res_df.loc[res_df['qval'] > fdr, 'pval_beta'].sort_values()
46 | 
47 |         if len(lb) > 0:  # significant phenotypes
48 |             lb = lb.iloc[-1]
49 |             if len(ub) > 0:
50 |                 ub = ub.iloc[0]
51 |                 pthreshold = (lb+ub)/2
52 |             else:
53 |                 pthreshold = lb
54 |             logger.write(f' * min p-value threshold @ FDR {fdr}: {pthreshold:.6g}')
55 |             res_df['pval_nominal_threshold'] = stats.beta.ppf(pthreshold, res_df['beta_shape1'], res_df['beta_shape2'])
56 | 
57 | 
58 | def calculate_afc(assoc_df, counts_df, genotype_df, variant_df=None, covariates_df=None,
59 |                   select_covariates=True, group='gene_id',
60 |                   imputation='offset', count_threshold=0, verbose=True):
61 |     """
62 |     Calculate allelic fold-change (aFC) for variant-gene pairs
63 | 
64 |     Inputs
65 |       assoc_df: dataframe containing variant-gene associations, must have 'gene_id'
66 |         and 'variant_id' columns. If multiple variants/gene are detected, effects
67 |         are estimated jointly.
68 |       counts_df: read counts scaled with DESeq size factors. Zeros are imputed using
69 |         log(counts + 1) (imputation='offset'; default) or with half-minimum
70 |         (imputation='half_min').
71 |       genotype_df: genotype dosages
72 |       covariates_df: covariates (genotype PCs, PEER factors, etc.)
73 | 
74 |     aFC [1] is computed using the total read count (trc) model from mixQTL [2].
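
    A minimal usage sketch (all identifiers are illustrative):

        assoc_df = pd.DataFrame({'gene_id':    ['geneA', 'geneA'],
                                 'variant_id': ['chr18_100_A_G_b38', 'chr18_500_C_T_b38']})
        afc_df = calculate_afc(assoc_df, counts_df, genotype_df,
                               variant_df=variant_df, covariates_df=covariates_df)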
75 | 
76 |     [1] Mohammadi et al., 2017 (genome.cshlp.org/content/27/11/1872)
77 |     [2] Liang et al., 2021 (10.1038/s41467-021-21592-8)
78 |     """
79 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
80 | 
81 |     if variant_df is not None:
82 |         gi = gt.GenotypeIndexer(genotype_df, variant_df)
83 |     else:
84 |         assert isinstance(genotype_df, gt.GenotypeIndexer)
85 |         gi = genotype_df
86 |     genotype_ix = np.array([gi.genotype_df.columns.tolist().index(i) for i in counts_df.columns])
87 |     genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
88 | 
89 |     if covariates_df is not None:
90 |         covariates_t = torch.tensor(covariates_df.values, dtype=torch.float32).to(device)
91 |     else:
92 |         covariates_t = None
93 | 
94 |     afc_df = []
95 |     n = len(assoc_df[group].unique())
96 |     for k, (phenotype_id, gdf) in enumerate(assoc_df.groupby(group, sort=False), 1):
97 |         if verbose and (k % 10 == 0 or k == n):
98 |             print(f"\rCalculating aFC for {group.replace('_id','')} {k}/{n}", end='' if k != n else None, flush=True)
99 | 
100 |         counts_t = torch.tensor(counts_df.loc[phenotype_id].values,
101 |                                 dtype=torch.float32).to(device)
102 |         genotypes_t = torch.tensor(gi.get_genotypes(gdf['variant_id'].tolist()), dtype=torch.float32).to(device)
103 |         genotypes_t = genotypes_t[:,genotype_ix_t]
104 |         impute_mean(genotypes_t)
105 |         try:
106 |             b, b_se = mixqtl.trc(genotypes_t, counts_t, covariates_t=covariates_t,
107 |                                  select_covariates=select_covariates, count_threshold=count_threshold,
108 |                                  imputation=imputation, mode='multi', return_af=False)
109 |             gdf['afc'] = b.cpu().numpy() * np.log2(np.e)
110 |             gdf['afc_se'] = b_se.cpu().numpy() * np.log2(np.e)
111 |             afc_df.append(gdf)
112 |         except Exception:
113 |             print(f'WARNING: aFC calculation failed for {phenotype_id}')
114 |     afc_df = pd.concat(afc_df)
115 | 
116 |     return afc_df
117 | 
118 | 
119 | def calculate_replication(res_df, genotypes, phenotype_df, covariates_df=None, paired_covariate_df=None,
120 |                           interaction_s=None, compute_pi1=False, lambda_qvalue=None, logp=False):
121 |     """res_df: DataFrame with 'variant_id' column and phenotype IDs as index"""
122 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
123 | 
124 |     if paired_covariate_df is not None:
125 |         assert paired_covariate_df.index.equals(covariates_df.index)
126 |         assert paired_covariate_df.columns.isin(phenotype_df.index).all()
127 | 
128 |     if isinstance(genotypes, pd.DataFrame):
129 |         genotypes_t = torch.tensor(genotypes.loc[res_df['variant_id']].values, dtype=torch.float).to(device)
130 |         genotype_ix = np.array([genotypes.columns.tolist().index(i) for i in phenotype_df.columns])
131 |     else:  # pgen.PgenReader
132 |         gt_df = genotypes.read_list(res_df['variant_id'], impute_mean=False)
133 |         genotypes_t = torch.tensor(gt_df.values, dtype=torch.float).to(device)
134 |         genotype_ix = np.array([gt_df.columns.tolist().index(i) for i in phenotype_df.columns])
135 | 
136 |     genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
137 |     genotypes_t = genotypes_t[:,genotype_ix_t]
138 |     impute_mean(genotypes_t)
139 |     af_t, ma_samples_t, ma_count_t = get_allele_stats(genotypes_t)
140 | 
141 |     phenotypes_t = torch.tensor(phenotype_df.loc[res_df.index].values, dtype=torch.float32).to(device)
142 | 
143 |     if covariates_df is not None:
144 |         residualizer = Residualizer(torch.tensor(covariates_df.values, dtype=torch.float32).to(device))
145 |         # dof -= covariates_df.shape[1]
146 |     else:
147 |         residualizer = None
148 | 
149 |     if interaction_s is None:
150 |         if paired_covariate_df is None:
151 |             if residualizer is not
None: 152 | genotype_res_t = residualizer.transform(genotypes_t) # variants x samples 153 | phenotype_res_t = residualizer.transform(phenotypes_t) # phenotypes x samples 154 | dof = residualizer.dof 155 | dof_t = dof 156 | else: 157 | genotype_res_t = genotypes_t 158 | phenotype_res_t = phenotypes_t 159 | dof = phenotypes_t.shape[1] - 2 160 | dof_t = dof 161 | else: 162 | genotype_res_t = torch.zeros_like(genotypes_t).to(device) 163 | phenotype_res_t = torch.zeros_like(phenotypes_t).to(device) 164 | dof = [] 165 | for k,phenotype_id in enumerate(res_df.index): 166 | if phenotype_id in paired_covariate_df: 167 | iresidualizer = Residualizer(torch.tensor(np.c_[covariates_df, paired_covariate_df[phenotype_id]], 168 | dtype=torch.float32).to(device)) 169 | else: 170 | iresidualizer = residualizer 171 | genotype_res_t[[k]] = iresidualizer.transform(genotypes_t[[k]]) 172 | phenotype_res_t[[k]] = iresidualizer.transform(phenotypes_t[[k]]) 173 | dof.append(iresidualizer.dof) 174 | dof = np.array(dof) 175 | dof_t = torch.Tensor(dof).to(device) 176 | 177 | gstd = genotype_res_t.var(1) 178 | pstd = phenotype_res_t.var(1) 179 | std_ratio_t = torch.sqrt(pstd / gstd) 180 | 181 | # center and normalize 182 | genotype_res_t = center_normalize(genotype_res_t, dim=1) 183 | phenotype_res_t = center_normalize(phenotype_res_t, dim=1) 184 | 185 | r_nominal_t = (genotype_res_t * phenotype_res_t).sum(1) 186 | r2_nominal_t = r_nominal_t.double().pow(2) 187 | 188 | tstat_t = torch.sqrt((dof_t * r2_nominal_t) / (1 - r2_nominal_t)) 189 | slope_t = r_nominal_t * std_ratio_t 190 | slope_se_t = (slope_t.abs().double() / tstat_t).float() 191 | pval = 2*stats.t.cdf(-np.abs(tstat_t.cpu()), dof) 192 | 193 | rep_df = pd.DataFrame(np.c_[res_df.index, res_df['variant_id'], ma_samples_t.cpu(), ma_count_t.cpu(), af_t.cpu(), pval, slope_t.cpu(), slope_se_t.cpu()], 194 | columns=['phenotype_id', 'variant_id', 'ma_samples', 'ma_count', 'af', 'pval_nominal', 'slope', 'slope_se']).infer_objects() 195 | 196 | else: 197 | if paired_covariate_df is not None: 198 | raise NotImplementedError("Paired covariates are not yet supported for interactions") 199 | 200 | interaction_t = torch.tensor(interaction_s.values.reshape(1,-1), dtype=torch.float32).to(device) 201 | ng, ns = genotypes_t.shape 202 | nps = phenotypes_t.shape[0] 203 | 204 | # centered inputs 205 | g0_t = genotypes_t - genotypes_t.mean(1, keepdim=True) 206 | gi_t = genotypes_t * interaction_t 207 | gi0_t = gi_t - gi_t.mean(1, keepdim=True) 208 | i0_t = interaction_t - interaction_t.mean() 209 | p0_t = phenotypes_t - phenotypes_t.mean(1, keepdim=True) 210 | 211 | # residualize rows 212 | g0_t = residualizer.transform(g0_t, center=False) 213 | gi0_t = residualizer.transform(gi0_t, center=False) 214 | p0_t = residualizer.transform(p0_t, center=False) # np x ns 215 | i0_t = residualizer.transform(i0_t, center=False) 216 | i0_t = i0_t.repeat(ng, 1) 217 | 218 | # regression (in float; loss of precision may occur in edge cases) 219 | X_t = torch.stack([g0_t, i0_t, gi0_t], 2) # ng x ns x 3 220 | Xinv = torch.matmul(torch.transpose(X_t, 1, 2), X_t).inverse() # ng x 3 x 3 221 | b_t = (torch.matmul(Xinv, torch.transpose(X_t, 1, 2)) * p0_t.unsqueeze(1)).sum(2) # ng x 3 222 | r_t = (X_t * b_t.unsqueeze(1)).sum(2) - p0_t 223 | dof = residualizer.dof - 2 224 | rss_t = (r_t*r_t).sum(1) # ng x np 225 | b_se_t = torch.sqrt( Xinv[:, torch.eye(3, dtype=torch.uint8).bool()] * rss_t.unsqueeze(-1) / dof ) 226 | tstat_t = (b_t.double() / b_se_t.double()).float() 227 | pval = 
2*stats.t.cdf(-np.abs(tstat_t.cpu()), dof) 228 | b = b_t.cpu() 229 | b_se = b_se_t.cpu() 230 | 231 | rep_df = pd.DataFrame(np.c_[res_df.index, res_df['variant_id'], ma_samples_t.cpu(), ma_count_t.cpu(), af_t.cpu(), 232 | pval[:,0], b[:,0], b_se[:,0], pval[:,1], b[:,1], b_se[:,1], pval[:,2], b[:,2], b_se[:,2]], 233 | columns=['phenotype_id', 'variant_id', 'ma_samples', 'ma_count', 'af', 234 | 'pval_g', 'b_g', 'b_g_se', 'pval_i', 'b_i', 'b_i_se', 'pval_gi', 'b_gi', 'b_gi_se']).infer_objects() 235 | pval = pval[:,2] 236 | 237 | if compute_pi1: 238 | try: 239 | pi1 = 1 - rfunc.pi0est(pval, lambda_qvalue=lambda_qvalue)[0] 240 | except: 241 | pi1 = np.nan 242 | return pi1, rep_df 243 | else: 244 | return rep_df 245 | 246 | 247 | def annotate_genes(gene_df, annotation_gtf, lookup_df=None): 248 | """ 249 | Add gene and variant annotations (e.g., gene_name, rs_id, etc.) to gene-level output 250 | 251 | gene_df: output from map_cis() 252 | annotation_gtf: gene annotation in GTF format 253 | lookup_df: DataFrame with variant annotations, indexed by 'variant_id' 254 | """ 255 | gene_dict = {} 256 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Adding gene and variant annotations', flush=True) 257 | print(' * parsing GTF', flush=True) 258 | with open(annotation_gtf) as gtf: 259 | for row in gtf: 260 | row = row.strip().split('\t') 261 | if row[0][0] == '#' or row[2] != 'gene': continue 262 | # get gene_id and gene_name from attributes 263 | attr = dict([i.split() for i in row[8].replace('"','').split(';') if i!='']) 264 | # gene_name, gene_chr, gene_start, gene_end, strand 265 | gene_dict[attr['gene_id']] = [attr['gene_name'], row[0], row[3], row[4], row[6]] 266 | 267 | print(' * annotating genes', flush=True) 268 | if 'group_id' in gene_df: 269 | gene_info = pd.DataFrame(data=[gene_dict[i] for i in gene_df['group_id']], 270 | columns=['gene_name', 'gene_chr', 'gene_start', 'gene_end', 'strand'], 271 | index=gene_df.index) 272 | else: 273 | gene_info = pd.DataFrame(data=[gene_dict[i] for i in gene_df.index], 274 | columns=['gene_name', 'gene_chr', 'gene_start', 'gene_end', 'strand'], 275 | index=gene_df.index) 276 | gene_df = pd.concat([gene_info, gene_df], axis=1) 277 | assert np.all(gene_df.index == gene_info.index) 278 | 279 | col_order = ['gene_name', 'gene_chr', 'gene_start', 'gene_end', 'strand', 280 | 'num_var', 'beta_shape1', 'beta_shape2', 'true_df', 'pval_true_df', 'variant_id'] 281 | if 'tss_distance' in gene_df: 282 | col_order += ['tss_distance'] 283 | else: 284 | col_order += ['start_distance', 'end_distance'] 285 | if lookup_df is not None: 286 | print(' * adding variant annotations from lookup table', flush=True) 287 | gene_df = gene_df.join(lookup_df, on='variant_id') # add variant information 288 | col_order += list(lookup_df.columns) 289 | col_order += ['ma_samples', 'ma_count', 'af', 'pval_nominal', 290 | 'slope', 'slope_se', 'pval_perm', 'pval_beta'] 291 | if 'group_id' in gene_df: 292 | col_order += ['group_id', 'group_size'] 293 | col_order += ['qval', 'pval_nominal_threshold'] 294 | gene_df = gene_df[col_order] 295 | print('done.', flush=True) 296 | return gene_df 297 | 298 | 299 | def get_significant_pairs(res_df, nominal_files, group_s=None, fdr=0.05): 300 | """Significant variant-phenotype pairs based on nominal p-value threshold for each phenotype""" 301 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] tensorQTL: parsing all significant variant-phenotype pairs', flush=True) 302 | assert 'qval' in res_df 303 | 304 | # significant phenotypes (apply FDR 
threshold)
305 |     if group_s is not None:
306 |         df = res_df.loc[res_df['qval'] <= fdr, ['pval_nominal_threshold', 'pval_nominal', 'pval_beta', 'group_id']].copy()
307 |         df.set_index('group_id', inplace=True)
308 |     else:
309 |         df = res_df.loc[res_df['qval'] <= fdr, ['pval_nominal_threshold', 'pval_nominal', 'pval_beta']].copy()
310 |     df.rename(columns={'pval_nominal': 'min_pval_nominal'}, inplace=True)
311 |     signif_phenotype_ids = set(df.index)
312 |     threshold_dict = df['pval_nominal_threshold'].to_dict()
313 | 
314 |     if isinstance(nominal_files, str):
315 |         # chr -> file
316 |         nominal_files = {os.path.basename(i).split('.')[-2]:i for i in glob.glob(nominal_files+'*.parquet')}
317 |     else:
318 |         assert isinstance(nominal_files, dict)
319 | 
320 |     chroms = sorted(nominal_files.keys(), key=lambda x: int(x.replace('chr', '').replace('X', '23')))
321 |     signif_df = []
322 |     for k,c in enumerate(chroms, 1):
323 |         print(f' * processing chr. {k}/{len(chroms)}', end='\r', flush=True)
324 |         nominal_df = pd.read_parquet(nominal_files[c])
325 |         # drop pairs that never pass threshold
326 |         nominal_df = nominal_df[nominal_df['pval_nominal'] <= df['pval_nominal_threshold'].max()]
327 |         if group_s is not None:
328 |             nominal_df.insert(1, 'group_id', nominal_df['phenotype_id'].map(group_s))
329 |             nominal_df = nominal_df[nominal_df['group_id'].isin(signif_phenotype_ids)]
330 |             m = nominal_df['pval_nominal'] < nominal_df['group_id'].apply(lambda x: threshold_dict[x])
331 |         else:
332 |             nominal_df = nominal_df[nominal_df['phenotype_id'].isin(signif_phenotype_ids)]
333 |             m = nominal_df['pval_nominal'] < nominal_df['phenotype_id'].apply(lambda x: threshold_dict[x])
334 |         signif_df.append(nominal_df[m])
335 |     print()
336 |     signif_df = pd.concat(signif_df, axis=0)
337 |     if group_s is not None:
338 |         signif_df = signif_df.merge(df, left_on='group_id', right_index=True)
339 |     else:
340 |         signif_df = signif_df.merge(df, left_on='phenotype_id', right_index=True)
341 |     print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] done', flush=True)
342 |     return signif_df.reset_index(drop=True)
343 | 
--------------------------------------------------------------------------------
/tensorqtl/rfunc.py:
--------------------------------------------------------------------------------
1 | # Author: Francois Aguet
2 | import numpy as np
3 | import rpy2
4 | from rpy2.robjects.packages import importr
5 | from collections.abc import Iterable
6 | from contextlib import contextmanager
7 | import os, sys  # required by suppress_stdout()
8 | # silence R warnings
9 | from rpy2.rinterface_lib.callbacks import logger as rpy2_logger
10 | import logging
11 | rpy2_logger.setLevel(logging.ERROR)
12 | 
13 | @contextmanager
14 | def suppress_stdout():
15 |     with open(os.devnull, "w") as devnull:
16 |         old_stdout = sys.stdout
17 |         sys.stdout = devnull
18 |         try:
19 |             yield
20 |         finally:
21 |             sys.stdout = old_stdout
22 | 
23 | 
24 | def p_adjust(p, method='BH'):
25 |     """Wrapper for p.adjust"""
26 |     rp = rpy2.robjects.vectors.FloatVector(p)
27 |     p_adjust = rpy2.robjects.r['p.adjust']
28 |     return np.array(p_adjust(rp, method=method))
29 | 
30 | 
31 | def t_cdf(t, df, lower_tail=False, log=True):
32 |     """Wrapper for pt"""
33 |     scalar = True
34 |     if isinstance(t, Iterable):
35 |         rt = rpy2.robjects.vectors.FloatVector(t)
36 |         scalar = False
37 |     else:
38 |         rt = t
39 |     if isinstance(df, Iterable):
40 |         rdf = rpy2.robjects.vectors.FloatVector(df)
41 |         scalar = False
42 |     else:
43 |         rdf = df
44 |     r_pt = rpy2.robjects.r['pt']
45 |     res = np.array(r_pt(rt, rdf, lower_tail=lower_tail, log=log))
46 |     if scalar:
47 |         res =
res[0] 48 | return res 49 | 50 | 51 | def qvalue(p, lambda_qvalue=None): 52 | """Wrapper for qvalue::qvalue""" 53 | qvalue = importr("qvalue") 54 | rp = rpy2.robjects.vectors.FloatVector(p) 55 | if lambda_qvalue is None: 56 | q = qvalue.qvalue(rp) 57 | else: 58 | if not isinstance(lambda_qvalue, Iterable): 59 | lambda_qvalue = [lambda_qvalue] 60 | rlambda = rpy2.robjects.vectors.FloatVector(lambda_qvalue) 61 | q = qvalue.qvalue(rp, **{'lambda':rlambda}) 62 | qval = np.array(q.rx2('qvalues')) 63 | pi0 = np.array(q.rx2('pi0'))[0] 64 | return qval, pi0 65 | 66 | 67 | def pi0est(p, lambda_qvalue=None): 68 | """Wrapper for qvalue::pi0est""" 69 | qvalue = importr("qvalue") 70 | rp = rpy2.robjects.vectors.FloatVector(p) 71 | # with suppress_stdout(): 72 | if lambda_qvalue is None: 73 | pi0res = qvalue.pi0est(rp) 74 | else: 75 | if not isinstance(lambda_qvalue, Iterable): 76 | lambda_qvalue = [lambda_qvalue] 77 | rlambda = rpy2.robjects.vectors.FloatVector(lambda_qvalue) 78 | pi0res = qvalue.pi0est(rp, rlambda) 79 | pi0 = np.array(pi0res.rx2('pi0'))[0] 80 | pi0_lambda = np.array(pi0res.rx2('pi0.lambda')) 81 | lambda_vec = np.array(pi0res.rx2('lambda')) 82 | pi0_smooth = np.array(pi0res.rx2('pi0.smooth')) 83 | return pi0, pi0_lambda, lambda_vec, pi0_smooth 84 | -------------------------------------------------------------------------------- /tensorqtl/tensorqtl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from __future__ import print_function 3 | import pandas as pd 4 | import numpy as np 5 | from datetime import datetime 6 | import sys 7 | import os 8 | import re 9 | import pickle 10 | import argparse 11 | from collections import defaultdict 12 | import importlib.metadata 13 | 14 | sys.path.insert(1, os.path.dirname(__file__)) 15 | from core import * 16 | from post import * 17 | import genotypeio, cis, trans, susie 18 | 19 | 20 | def main(): 21 | parser = argparse.ArgumentParser(description='tensorQTL: GPU-based QTL mapper') 22 | parser.add_argument('genotype_path', help='Genotypes in PLINK format') 23 | parser.add_argument('phenotypes', help="Phenotypes in BED format (.bed, .bed.gz, .bed.parquet), or optionally for 'trans' mode, parquet or tab-delimited.") 24 | parser.add_argument('prefix', help='Prefix for output file names') 25 | parser.add_argument('--mode', type=str, default='cis', choices=['cis', 'cis_nominal', 'cis_independent', 'cis_susie', 'trans', 'trans_susie'], 26 | help='Mapping mode. Default: cis') 27 | parser.add_argument('--covariates', default=None, help='Covariates file, tab-delimited (covariates x samples)') 28 | parser.add_argument('--paired_covariate', default=None, help='Single phenotype-specific covariate, tab-delimited (phenotypes x samples)') 29 | parser.add_argument('--permutations', type=int, default=10000, help='Number of permutations. Default: 10000') 30 | parser.add_argument('--interaction', default=None, type=str, help='Tab-delimited file mapping sample ID to interaction value(s) (if multiple interaction terms are used, the file must include a header with variable names)') 31 | parser.add_argument('--cis_output', default=None, type=str, help="Output from 'cis' mode with q-values. Required for independent cis-QTL mapping.") 32 | parser.add_argument('--phenotype_groups', default=None, type=str, help='Phenotype groups. Header-less TSV with two columns: phenotype_id, group_id') 33 | parser.add_argument('--window', default=1000000, type=np.int32, help='Cis-window size, in bases. 
Default: 1000000.') 34 | parser.add_argument('--pval_threshold', default=1e-5, type=np.float64, help='Output only significant phenotype-variant pairs with a p-value below threshold. Default: 1e-5 for trans-QTL') 35 | parser.add_argument('--logp', action='store_true', help='Compute nominal p-values as -log10(P) for added precision (requires R)') 36 | parser.add_argument('--maf_threshold', default=0, type=np.float64, help='Include only genotypes with minor allele frequency >= maf_threshold. Default: 0') 37 | parser.add_argument('--maf_threshold_interaction', default=0.05, type=np.float64, help='MAF threshold for interactions, applied to lower and upper half of samples') 38 | parser.add_argument('--dosages', action='store_true', help='Load dosages instead of genotypes (only applies to PLINK2 bgen input).') 39 | parser.add_argument('--return_dense', action='store_true', help='Return dense output for trans-QTL.') 40 | parser.add_argument('--return_r2', action='store_true', help='Return r2 (only for sparse trans-QTL output)') 41 | parser.add_argument('--best_only', action='store_true', help='Only write lead association for each phenotype (interaction mode only)') 42 | parser.add_argument('--output_text', action='store_true', help='Write output in txt.gz format instead of parquet (trans-QTL mode only)') 43 | parser.add_argument('--batch_size', type=int, default=20000, help='GPU batch size (trans-QTLs only). Reduce this if encountering OOM errors.') 44 | parser.add_argument('--chunk_size', default=None, help="For cis-QTL mapping, load genotypes into CPU memory in chunks of chunk_size variants, or by chromosome if chunk_size is 'chr'.") 45 | parser.add_argument('--susie_loci', default=None, help="Table (parquet or tsv) with loci to fine-map (phenotype_id, chr, pos) with mode 'trans_susie'.") 46 | parser.add_argument('--disable_beta_approx', action='store_true', help='Disable Beta-distribution approximation of empirical p-values (not recommended).') 47 | parser.add_argument('--warn_monomorphic', action='store_true', help='Warn if monomorphic variants are found.') 48 | parser.add_argument('--max_effects', type=int, default=10, help='Maximum number of non-zero effects in the SuSiE regression model.') 49 | parser.add_argument('--fdr', default=0.05, type=np.float64, help='FDR for cis-QTLs') 50 | parser.add_argument('--qvalue_lambda', default=None, type=np.float64, help='lambda parameter for pi0est in qvalue.') 51 | parser.add_argument('--seed', default=None, type=int, help='Seed for permutations.') 52 | parser.add_argument('-o', '--output_dir', default='.', help='Output directory') 53 | args = parser.parse_args() 54 | 55 | # check inputs 56 | if args.mode == 'cis_independent' and (args.cis_output is None or not os.path.exists(args.cis_output)): 57 | raise ValueError("Output from 'cis' mode must be provided.") 58 | if args.interaction is not None and args.mode not in ['cis_nominal', 'trans']: 59 | raise ValueError("Interactions are only supported in 'cis_nominal' or 'trans' mode.") 60 | 61 | logger = SimpleLogger(os.path.join(args.output_dir, f'{args.prefix}.tensorQTL.{args.mode}.log')) 62 | logger.write(f'[{datetime.now().strftime("%b %d %H:%M:%S")}] Running TensorQTL v{importlib.metadata.version("tensorqtl")}: {args.mode.split("_")[0]}-QTL mapping') 63 | if torch.cuda.is_available(): 64 | logger.write(f' * using GPU ({torch.cuda.get_device_name(torch.cuda.current_device())})') 65 | else: 66 | logger.write(' * WARNING: using CPU!') 67 | device = torch.device("cuda" if torch.cuda.is_available() else 
"cpu") 68 | if args.seed is not None: 69 | logger.write(f' * using seed {args.seed}') 70 | 71 | # load inputs 72 | logger.write(f' * reading phenotypes ({args.phenotypes})') 73 | # for cis modes, require BED input with position information 74 | if args.mode.startswith('cis'): 75 | assert args.phenotypes.lower().endswith(('.bed', '.bed.gz', '.bed.parquet')), "For cis modes, phenotypes must be in BED format." 76 | phenotype_df, phenotype_pos_df = read_phenotype_bed(args.phenotypes) 77 | if phenotype_pos_df.columns[1] == 'pos': 78 | logger.write(f" * cis-window detected as position ± {args.window:,}") 79 | else: 80 | logger.write(f" * cis-window detected as [start - {args.window:,}, end + {args.window:,}]") 81 | elif args.mode.startswith('trans'): 82 | if args.phenotypes.lower().endswith(('.bed', '.bed.gz', '.bed.parquet')): 83 | phenotype_df, phenotype_pos_df = read_phenotype_bed(args.phenotypes) 84 | else: 85 | if args.phenotypes.endswith('.parquet'): 86 | phenotype_df = pd.read_parquet(args.phenotypes) 87 | else: # assume tab-delimited 88 | phenotype_df = pd.read_csv(args.phenotypes, sep='\t', index_col=0) 89 | phenotype_pos_df = None 90 | 91 | if args.covariates is not None: 92 | logger.write(f' * reading covariates ({args.covariates})') 93 | covariates_df = pd.read_csv(args.covariates, sep='\t', index_col=0).T 94 | assert phenotype_df.columns.equals(covariates_df.index) 95 | else: 96 | covariates_df = None 97 | 98 | if args.paired_covariate is not None: 99 | assert covariates_df is not None, f"Covariates matrix must be provided when using paired covariate" 100 | paired_covariate_df = pd.read_csv(args.paired_covariate, sep='\t', index_col=0) # phenotypes x samples 101 | assert paired_covariate_df.index.isin(phenotype_df.index).all(), f"Paired covariate phenotypes must be present in phenotype matrix." 102 | assert paired_covariate_df.columns.equals(phenotype_df.columns), f"Paired covariate samples must match samples in phenotype matrix." 
103 | else: 104 | paired_covariate_df = None 105 | 106 | if args.interaction is not None: 107 | logger.write(f' * reading interaction term(s) ({args.interaction})') 108 | # allow headerless input for single interactions 109 | with open(args.interaction) as f: 110 | f.readline() 111 | s = f.readline().strip() 112 | if len(s.split('\t')) == 2: # index + value 113 | interaction_df = pd.read_csv(args.interaction, sep='\t', index_col=0, header=None) 114 | else: 115 | interaction_df = pd.read_csv(args.interaction, sep='\t', index_col=0) 116 | # select samples 117 | assert covariates_df.index.isin(interaction_df.index).all() 118 | interaction_df = interaction_df.loc[covariates_df.index].astype(np.float32) 119 | else: 120 | interaction_df = None 121 | 122 | if args.maf_threshold is None: 123 | if args.mode == 'trans': 124 | maf_threshold = 0.05 125 | else: 126 | maf_threshold = 0 127 | else: 128 | maf_threshold = args.maf_threshold 129 | 130 | if args.phenotype_groups is not None: 131 | group_s = pd.read_csv(args.phenotype_groups, sep='\t', index_col=0, header=None).squeeze('columns').rename(None) 132 | # verify sort order 133 | group_dict = group_s.to_dict() 134 | previous_group = '' 135 | parsed_groups = 0 136 | for i in phenotype_df.index: 137 | if group_dict[i] != previous_group: 138 | parsed_groups += 1 139 | previous_group = group_dict[i] 140 | if not parsed_groups == len(group_s.unique()): 141 | raise ValueError('Groups defined in input do not match phenotype file (check sort order).') 142 | else: 143 | group_s = None 144 | 145 | # load genotypes 146 | if args.chunk_size is None: # load all genotypes into memory 147 | logger.write(f' * loading genotype dosages' if args.dosages else f' * loading genotypes') 148 | genotype_df, variant_df = genotypeio.load_genotypes(args.genotype_path, select_samples=phenotype_df.columns, dosages=args.dosages) 149 | if variant_df is None: 150 | assert not args.mode.startswith('cis'), f"Genotype data without variant positions is only supported for mode='trans'." 
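    # Example invocation with chunked genotype loading (paths are placeholders);
    # chunked mode requires pgen/psam/pvar input, handled by the else-branch below:
    #   python3 -m tensorqtl data/GEUVADIS.chr18 data/expression.bed.gz out_prefix \
    #       --covariates data/covariates.txt --mode cis --chunk_size 50000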
151 | else: 152 | if not all([os.path.exists(f"{args.genotype_path}.{ext}") for ext in ['pgen', 'psam', 'pvar']]): 153 | raise ValueError("Processing in chunks requires PLINK 2 pgen/psam/pvar files.") 154 | import pgen 155 | pgr = pgen.PgenReader(args.genotype_path, select_samples=phenotype_df.columns) 156 | 157 | if args.mode == 'cis': 158 | if args.chunk_size is None: 159 | res_df = cis.map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, covariates_df=covariates_df, 160 | group_s=group_s, paired_covariate_df=paired_covariate_df, nperm=args.permutations, 161 | window=args.window, beta_approx=not args.disable_beta_approx, maf_threshold=maf_threshold, 162 | warn_monomorphic=args.warn_monomorphic, logger=logger, seed=args.seed, verbose=True) 163 | else: 164 | res_df = [] 165 | for gt_df, var_df, p_df, p_pos_df, _ in genotypeio.generate_paired_chunks(pgr, phenotype_df, phenotype_pos_df, args.chunk_size, 166 | dosages=args.dosages, verbose=True): 167 | res_df.append(cis.map_cis(gt_df, var_df, p_df, p_pos_df, covariates_df=covariates_df, 168 | group_s=group_s, paired_covariate_df=paired_covariate_df, nperm=args.permutations, 169 | window=args.window, beta_approx=not args.disable_beta_approx, maf_threshold=maf_threshold, 170 | warn_monomorphic=args.warn_monomorphic, logger=logger, seed=args.seed, verbose=True)) 171 | res_df = pd.concat(res_df) 172 | logger.write(' * writing output') 173 | if has_rpy2: 174 | calculate_qvalues(res_df, fdr=args.fdr, qvalue_lambda=args.qvalue_lambda, logger=logger) 175 | out_file = os.path.join(args.output_dir, f'{args.prefix}.cis_qtl.txt.gz') 176 | res_df.to_csv(out_file, sep='\t', float_format='%.6g') 177 | 178 | elif args.mode == 'cis_nominal': 179 | if args.chunk_size is None: 180 | cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, args.prefix, covariates_df=covariates_df, 181 | paired_covariate_df=paired_covariate_df, interaction_df=interaction_df, 182 | maf_threshold_interaction=args.maf_threshold_interaction, 183 | group_s=None, window=args.window, maf_threshold=maf_threshold, run_eigenmt=True, 184 | output_dir=args.output_dir, write_top=True, write_stats=not args.best_only, logger=logger, verbose=True) 185 | # compute significant pairs 186 | if args.cis_output is not None: 187 | cis_df = pd.read_csv(args.cis_output, sep='\t', index_col=0) 188 | nominal_prefix = os.path.join(args.output_dir, f'{args.prefix}.cis_qtl_pairs') 189 | signif_df = get_significant_pairs(cis_df, nominal_prefix, group_s=group_s, fdr=args.fdr) 190 | signif_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.cis_qtl.signif_pairs.parquet')) 191 | 192 | else: 193 | chunks = [] 194 | for gt_df, var_df, p_df, p_pos_df, ci in genotypeio.generate_paired_chunks(pgr, phenotype_df, phenotype_pos_df, args.chunk_size, 195 | dosages=args.dosages, verbose=True): 196 | prefix = f"{args.prefix}.chunk{ci+1}" 197 | chunks.append(prefix) 198 | cis.map_nominal(gt_df, var_df, p_df, p_pos_df, prefix, covariates_df=covariates_df, 199 | paired_covariate_df=paired_covariate_df, interaction_df=interaction_df, 200 | maf_threshold_interaction=args.maf_threshold_interaction, 201 | group_s=None, window=args.window, maf_threshold=maf_threshold, run_eigenmt=True, 202 | output_dir=args.output_dir, write_top=True, write_stats=not args.best_only, logger=logger, verbose=True) 203 | chunk_files = glob.glob(os.path.join(args.output_dir, f"{args.prefix}.chunk*.cis_qtl_pairs.*.parquet")) 204 | if args.chunk_size == 'chr': # remove redundant chunk ID from file names 205 | for f in 
chunk_files: 206 | x = re.findall(f"{args.prefix}\.(chunk\d+)", os.path.basename(f)) 207 | assert len(x) == 1 208 | os.rename(f, f.replace(f"{x[0]}.", "")) 209 | else: # concatenate outputs by chromosome 210 | chunk_df = pd.DataFrame({ 211 | 'file': chunk_files, 212 | 'chunk': [int(re.findall(f"{args.prefix}\.chunk(\d+)", os.path.basename(i))[0]) for i in chunk_files], 213 | 'chr': [re.findall("\.cis_qtl_pairs\.(.*)\.parquet", os.path.basename(i))[0] for i in chunk_files], 214 | }).sort_values('chunk') 215 | for chrom, chr_df in chunk_df.groupby('chr', sort=False): 216 | print(f"\rConcatenating outputs for {chrom}", end='' if chrom != chunk_df['chr'].iloc[-1] else None) 217 | pd.concat([pd.read_parquet(f) for f in chr_df['file']]).reset_index(drop=True).to_parquet( 218 | os.path.join(args.output_dir, f"{args.prefix}.cis_qtl_pairs.{chrom}.parquet")) 219 | for f in chr_df['file']: 220 | os.remove(f) 221 | # concatenate interaction results 222 | if interaction_df is not None: 223 | chunk_files = [os.path.join(args.output_dir, f"{c}.cis_qtl_top_assoc.txt.gz") for c in chunks] 224 | pd.concat([pd.read_csv(f, sep='\t', index_col=0, dtype=str) for f in chunk_files]).to_csv( 225 | os.path.join(args.output_dir, f"{args.prefix}.cis_qtl_top_assoc.txt.gz"), sep='\t') 226 | for f in chunk_files: 227 | os.remove(f) 228 | 229 | elif args.mode == 'cis_independent': 230 | summary_df = pd.read_csv(args.cis_output, sep='\t', index_col=0) 231 | summary_df.rename(columns={'minor_allele_samples':'ma_samples', 'minor_allele_count':'ma_count'}, inplace=True) 232 | if args.chunk_size is None: 233 | res_df = cis.map_independent(genotype_df, variant_df, summary_df, phenotype_df, phenotype_pos_df, covariates_df=covariates_df, 234 | group_s=group_s, fdr=args.fdr, nperm=args.permutations, window=args.window, 235 | maf_threshold=maf_threshold, logger=logger, seed=args.seed, verbose=True) 236 | else: 237 | res_df = [] 238 | for gt_df, var_df, p_df, p_pos_df, _ in genotypeio.generate_paired_chunks(pgr, phenotype_df, phenotype_pos_df, args.chunk_size, 239 | dosages=args.dosages, verbose=True): 240 | res_df.append(cis.map_independent(gt_df, var_df, summary_df, p_df, p_pos_df, covariates_df=covariates_df, 241 | group_s=group_s, fdr=args.fdr, nperm=args.permutations, window=args.window, 242 | maf_threshold=maf_threshold, logger=logger, seed=args.seed, verbose=True)) 243 | res_df = pd.concat(res_df).reset_index(drop=True) 244 | logger.write(' * writing output') 245 | out_file = os.path.join(args.output_dir, f'{args.prefix}.cis_independent_qtl.txt.gz') 246 | res_df.to_csv(out_file, sep='\t', index=False, float_format='%.6g') 247 | 248 | elif args.mode == 'cis_susie': 249 | if args.cis_output.endswith('.parquet'): 250 | signif_df = pd.read_parquet(args.cis_output) 251 | else: 252 | signif_df = pd.read_csv(args.cis_output, sep='\t') 253 | if 'qval' in signif_df: # otherwise input is from get_significant_pairs 254 | signif_df = signif_df[signif_df['qval'] <= args.fdr] 255 | phenotype_ids = phenotype_df.index[phenotype_df.index.isin(signif_df['phenotype_id'].unique())] 256 | phenotype_df = phenotype_df.loc[phenotype_ids] 257 | phenotype_pos_df = phenotype_pos_df.loc[phenotype_ids] 258 | if args.chunk_size is None: 259 | summary_df, res = susie.map(genotype_df, variant_df, phenotype_df, phenotype_pos_df, 260 | covariates_df, paired_covariate_df=paired_covariate_df, L=args.max_effects, 261 | maf_threshold=maf_threshold, max_iter=500, window=args.window, summary_only=False) 262 | else: 263 | summary_df = [] 264 | res = {} 265 | for 
gt_df, var_df, p_df, p_pos_df, _ in genotypeio.generate_paired_chunks(pgr, phenotype_df, phenotype_pos_df, args.chunk_size, 266 | dosages=args.dosages, verbose=True): 267 | chunk_summary_df, chunk_res = susie.map(gt_df, var_df, p_df, p_pos_df, 268 | covariates_df, paired_covariate_df=paired_covariate_df, L=args.max_effects, 269 | maf_threshold=maf_threshold, max_iter=500, window=args.window, summary_only=False) 270 | summary_df.append(chunk_summary_df) 271 | res |= chunk_res 272 | summary_df = pd.concat(summary_df).reset_index(drop=True) 273 | 274 | summary_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.SuSiE_summary.parquet')) 275 | with open(os.path.join(args.output_dir, f'{args.prefix}.SuSiE.pickle'), 'wb') as f: 276 | pickle.dump(res, f) 277 | 278 | elif args.mode == 'trans_susie': 279 | assert args.susie_loci is not None 280 | if args.susie_loci.endswith('.parquet'): 281 | locus_df = pd.read_parquet(args.susie_loci) 282 | else: 283 | locus_df = pd.read_csv(args.susie_loci, sep='\t') 284 | locus_df.rename(columns={'position':'pos'}, inplace=True) 285 | if args.chunk_size is None: 286 | assert variant_df is not None 287 | summary_df, res = susie.map_loci(locus_df, genotype_df, variant_df, phenotype_df, covariates_df, 288 | maf_threshold=maf_threshold, max_iter=500, window=args.window) 289 | else: 290 | raise NotImplementedError() 291 | 292 | summary_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.SuSiE_summary.parquet')) 293 | with open(os.path.join(args.output_dir, f'{args.prefix}.SuSiE.pickle'), 'wb') as f: 294 | pickle.dump(res, f) 295 | 296 | elif args.mode == 'trans': 297 | return_sparse = not args.return_dense 298 | if return_sparse: 299 | logger.write(f' * p-value threshold: {args.pval_threshold:.2g}') 300 | 301 | if interaction_df is not None: 302 | if interaction_df.shape[1] > 1: 303 | raise NotImplementedError('trans-QTL mapping currently only supports a single interaction.') 304 | else: 305 | interaction_df = interaction_df.squeeze('columns') 306 | 307 | if args.chunk_size is None: 308 | pairs_df = trans.map_trans(genotype_df, phenotype_df, covariates_df=covariates_df, interaction_s=interaction_df, 309 | return_sparse=return_sparse, pval_threshold=args.pval_threshold, 310 | maf_threshold=maf_threshold, batch_size=args.batch_size, 311 | return_r2=args.return_r2, logger=logger) 312 | if args.return_dense: 313 | pval_df, b_df, b_se_df, af_s = pairs_df 314 | else: 315 | pairs_df = [] 316 | n, rem = np.divmod(pgr.num_variants, int(args.chunk_size)) 317 | bounds = [0] + n * [int(args.chunk_size)] 318 | if rem != 0: 319 | bounds.append(rem) 320 | bounds = np.cumsum(bounds) 321 | nchunks = len(bounds)-1 322 | for i in range(nchunks): 323 | print(f"Processing genotype chunk {i+1}/{nchunks}") 324 | if args.dosages: 325 | gt_df = pgr.read_dosages_range(bounds[i], bounds[i+1]-1, dtype=np.float32) 326 | else: 327 | gt_df = pgr.read_range(bounds[i], bounds[i+1]-1, impute_mean=False, dtype=np.int8) 328 | pairs_df.append(trans.map_trans(gt_df, phenotype_df, covariates_df=covariates_df, interaction_s=interaction_df, 329 | return_sparse=return_sparse, pval_threshold=args.pval_threshold, 330 | maf_threshold=maf_threshold, batch_size=args.batch_size, 331 | return_r2=args.return_r2, logger=logger)) 332 | pairs_df = pd.concat(pairs_df).reset_index(drop=True) 333 | variant_df = pgr.variant_df 334 | 335 | if return_sparse: 336 | if variant_df is not None and phenotype_pos_df is not None: 337 | logger.write(' * filtering out cis-QTLs (within +/-5Mb)') 338 | pairs_df = 
trans.filter_cis(pairs_df, phenotype_pos_df, variant_df, window=5000000) 339 | 340 | logger.write(' * writing sparse output') 341 | if not args.output_text: 342 | pairs_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.trans_qtl_pairs.parquet')) 343 | else: 344 | out_file = os.path.join(args.output_dir, f'{args.prefix}.trans_qtl_pairs.txt.gz') 345 | pairs_df.to_csv(out_file, sep='\t', index=False, float_format='%.6g') 346 | else: 347 | logger.write(' * writing dense output') 348 | pval_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.trans_qtl_pval.parquet')) 349 | b_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.trans_qtl_beta.parquet')) 350 | b_se_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.trans_qtl_beta_se.parquet')) 351 | af_s.to_frame().to_parquet(os.path.join(args.output_dir, f'{args.prefix}.trans_qtl_af.parquet')) 352 | 353 | logger.write(f'[{datetime.now().strftime("%b %d %H:%M:%S")}] Finished mapping') 354 | 355 | 356 | if __name__ == '__main__': 357 | main() 358 | -------------------------------------------------------------------------------- /tensorqtl/trans.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils import data 3 | import numpy as np 4 | import pandas as pd 5 | import scipy.stats as stats 6 | from collections import OrderedDict 7 | import sys 8 | import os 9 | import time 10 | 11 | sys.path.insert(1, os.path.dirname(__file__)) 12 | import genotypeio 13 | from core import * 14 | 15 | 16 | def _in_cis(chrom, pos, gene_id, pos_dict, window=1000000): 17 | """Test if a variant is within +/-window of a gene's TSS.""" 18 | if chrom == pos_dict[gene_id]['chr']: 19 | gene_dict = pos_dict[gene_id] 20 | if 'pos' in gene_dict: 21 | start = gene_dict['pos'] 22 | end = start 23 | else: 24 | start = gene_dict['start'] 25 | end = gene_dict['end'] 26 | if pos >= start - window and pos <= end + window: 27 | return True 28 | else: 29 | return False 30 | else: 31 | return False 32 | 33 | 34 | def filter_cis(pairs_df, phenotype_pos_df, variant_df, window=5000000): 35 | """Filter out cis-QTLs 36 | 37 | Args: 38 | pairs_df: sparse output from map_trans() 39 | pos_dict: phenotype_id -> pos 40 | window: filter variants within +/-window of feature position (e.g., TSS for genes) 41 | """ 42 | pos_dict = phenotype_pos_df.T.to_dict() 43 | variant_df = variant_df.loc[pairs_df['variant_id'].unique()].copy() 44 | variant_dict = {v:{'chrom':c, 'pos':p} for v,c,p in zip(variant_df.index, variant_df['chrom'], variant_df['pos'])} 45 | 46 | drop_ix = [] 47 | for k,gene_id,variant_id in zip(pairs_df['phenotype_id'].index, pairs_df['phenotype_id'], pairs_df['variant_id']): 48 | if _in_cis(variant_dict[variant_id]['chrom'], variant_dict[variant_id]['pos'], gene_id, pos_dict, window=window): 49 | drop_ix.append(k) 50 | return pairs_df.drop(drop_ix) 51 | 52 | 53 | def map_trans(genotype_df, phenotype_df, covariates_df=None, interaction_s=None, 54 | return_sparse=True, pval_threshold=1e-5, maf_threshold=0.05, 55 | alleles=2, return_r2=False, batch_size=20000, 56 | logp=False, logger=None, verbose=True): 57 | """Run trans-QTL mapping 58 | 59 | Outputs (return_sparse == True): 60 | pval_df: DataFrame with columns variant_id, phenotype_id, pval, b, b_se, af 61 | Outputs (return_sparse == False): 62 | pval_df 63 | b_df 64 | b_se_df 65 | af_s 66 | """ 67 | 68 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 69 | 70 | if logger is None: 71 | logger = 
53 | def map_trans(genotype_df, phenotype_df, covariates_df=None, interaction_s=None,
54 |               return_sparse=True, pval_threshold=1e-5, maf_threshold=0.05,
55 |               alleles=2, return_r2=False, batch_size=20000,
56 |               logp=False, logger=None, verbose=True):
57 |     """Run trans-QTL mapping
58 | 
59 |     Outputs (return_sparse == True):
60 |       pval_df: DataFrame with columns variant_id, phenotype_id, pval, b, b_se, af (and r2 if return_r2 == True)
61 |     Outputs (return_sparse == False):
62 |       pval_df: p-values (variants x phenotypes)
63 |       b_df: slopes
64 |       b_se_df: slope standard errors
65 |       af_s: allele frequencies
66 |     """
67 | 
68 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
69 | 
70 |     if logger is None:
71 |         logger = SimpleLogger(verbose=verbose)
72 | 
73 |     variant_ids = genotype_df.index.tolist()
74 |     variant_dict = {i:j for i,j in enumerate(variant_ids)}
75 |     n_variants = len(variant_ids)
76 |     n_samples = phenotype_df.shape[1]
77 |     dof = n_samples - 2
78 | 
79 |     logger.write('trans-QTL mapping')
80 |     logger.write(f' * {n_samples} samples')
81 |     logger.write(f' * {phenotype_df.shape[0]} phenotypes')
82 |     if covariates_df is not None:
83 |         assert np.all(phenotype_df.columns==covariates_df.index)
84 |         logger.write(f' * {covariates_df.shape[1]} covariates')
85 |         residualizer = Residualizer(torch.tensor(covariates_df.values, dtype=torch.float32).to(device))
86 |         dof -= covariates_df.shape[1]
87 |     else:
88 |         residualizer = None
89 |     logger.write(f' * {n_variants} variants')
90 |     if interaction_s is not None:
91 |         logger.write(' * including interaction term')
92 | 
93 |     phenotypes_t = torch.tensor(phenotype_df.values, dtype=torch.float32).to(device)
94 |     genotype_ix = np.array([genotype_df.columns.tolist().index(i) for i in phenotype_df.columns])
95 |     genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
96 | 
97 |     # calculate correlation threshold for sparse output
98 |     if return_sparse:
99 |         tstat_threshold = -stats.t.ppf(pval_threshold/2, dof)
100 |         r_threshold = tstat_threshold / np.sqrt(dof + tstat_threshold**2)
101 |     else:
102 |         tstat_threshold = None
103 |         r_threshold = None
104 | 
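    # How r_threshold follows from pval_threshold (a sketch with assumed values):
    # inverting t = r*sqrt(dof/(1 - r^2)) gives r = t/sqrt(dof + t^2), so for a
    # hypothetical dof = 500 and pval_threshold = 1e-5:
    #
    #   tstat_threshold = -stats.t.ppf(1e-5/2, 500)                        # ~4.46 (two-sided)
    #   r_threshold = tstat_threshold / np.sqrt(500 + tstat_threshold**2)  # ~0.196
    #
    # i.e., only pairs with |r| >= ~0.196 are retained in sparse mode.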
105 |     if interaction_s is None:
106 |         ggt = genotypeio.GenotypeGeneratorTrans(genotype_df, batch_size=batch_size)
107 |         start_time = time.time()
108 |         res = []
109 |         n_variants = 0  # recount of variants passing the MAF filter
110 |         for k, (genotypes, variant_ids) in enumerate(ggt.generate_data(verbose=verbose), 1):
111 |             # copy genotypes to GPU
112 |             genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
113 | 
114 |             # filter by MAF
115 |             genotypes_t = genotypes_t[:,genotype_ix_t]
116 |             impute_mean(genotypes_t)
117 |             genotypes_t, variant_ids, af_t = filter_maf(genotypes_t, variant_ids, maf_threshold)
118 |             n_variants += genotypes_t.shape[0]
119 | 
120 |             r_t, genotype_var_t, phenotype_var_t = calculate_corr(genotypes_t, phenotypes_t, residualizer=residualizer, return_var=True)
121 |             del genotypes_t
122 | 
123 |             if return_sparse:
124 |                 m = r_t.abs() >= r_threshold
125 |                 ix_t = m.nonzero(as_tuple=False)  # sparse index
126 |                 ix = ix_t.cpu().numpy()
127 | 
128 |                 r_t = r_t.masked_select(m).type(torch.float64)
129 |                 r2_t = r_t.pow(2)
130 |                 tstat_t = r_t * torch.sqrt(dof / (1 - r2_t))
131 |                 std_ratio_t = torch.sqrt(phenotype_var_t[ix_t[:,1]] / genotype_var_t[ix_t[:,0]])
132 |                 b_t = r_t * std_ratio_t
133 |                 b_se_t = (b_t / tstat_t).type(torch.float32)
134 | 
135 |                 res.append(np.c_[
136 |                     variant_ids[ix[:,0]], phenotype_df.index[ix[:,1]],
137 |                     tstat_t.cpu(), b_t.cpu(), b_se_t.cpu(),
138 |                     r2_t.float().cpu(), af_t[ix_t[:,0]].cpu()
139 |                 ])
140 |             else:  # dense output: pval, b, b_se, af
141 |                 r_t = r_t.type(torch.float64)
142 |                 tstat_t = r_t * torch.sqrt(dof / (1 - r_t.pow(2)))
143 |                 std_ratio_t = torch.sqrt(phenotype_var_t / genotype_var_t.reshape(-1,1))
144 |                 b_t = (r_t * std_ratio_t).type(torch.float32)
145 |                 b_se_t = (b_t / tstat_t).type(torch.float32)
146 |                 res.append([variant_ids, tstat_t.cpu(), b_t.cpu(), b_se_t.cpu(), af_t.cpu()])
147 | 
148 |         logger.write(f' elapsed time: {(time.time()-start_time)/60:.2f} min')
149 |         del phenotypes_t
150 |         del residualizer
151 | 
152 |         if maf_threshold > 0:
153 |             logger.write(f' * {n_variants} variants passed MAF >= {maf_threshold} filtering')
154 | 
155 |         # post-processing: concatenate batches
156 |         if return_sparse:
157 |             res = np.concatenate(res)
158 |             res[:,2] = get_t_pval(res[:,2].astype(np.float64), dof, log=logp)  # convert t-statistics to p-values
159 |             pval_df = pd.DataFrame(res, columns=['variant_id', 'phenotype_id', 'pval', 'b', 'b_se', 'r2', 'af'])
160 |             pval_df['pval'] = pval_df['pval'].astype(np.float64)
161 |             pval_df['b'] = pval_df['b'].astype(np.float32)
162 |             pval_df['b_se'] = pval_df['b_se'].astype(np.float32)
163 |             pval_df['r2'] = pval_df['r2'].astype(np.float32)
164 |             pval_df['af'] = pval_df['af'].astype(np.float32)
165 |             if not return_r2:
166 |                 pval_df.drop('r2', axis=1, inplace=True)
167 |             logger.write('done.')
168 |             return pval_df
169 |         else:
170 |             variant_ids = pd.Series(np.concatenate([i[0] for i in res]), name='variant_id')
171 |             pval_df = pd.DataFrame(get_t_pval(np.concatenate([i[1] for i in res]).astype(np.float64), dof, log=logp),
172 |                                    index=variant_ids, columns=phenotype_df.index)
173 |             b_df = pd.DataFrame(np.concatenate([i[2] for i in res]),
174 |                                 index=variant_ids, columns=phenotype_df.index)
175 |             b_se_df = pd.DataFrame(np.concatenate([i[3] for i in res]),
176 |                                    index=variant_ids, columns=phenotype_df.index)
177 |             af_s = pd.Series(np.concatenate([i[4] for i in res]),
178 |                              index=variant_ids, name='af')
179 |             logger.write('done.')
180 |             return pval_df, b_df, b_se_df, af_s
181 | 
182 | 
183 |     else:  # interaction model
184 |         dof = n_samples - 4 - covariates_df.shape[1]
185 |         interaction_t = torch.tensor(interaction_s.values.reshape(1,-1), dtype=torch.float32).to(device)  # 1 x n_samples
186 |         mask_s = pd.Series(True, index=interaction_s.index)
187 |         mask_s[interaction_s.sort_values(kind='mergesort').index[:interaction_s.shape[0]//2]] = False
188 |         interaction_mask_t = torch.BoolTensor(mask_s.values).to(device)
189 | 
190 |         ggt = genotypeio.GenotypeGeneratorTrans(genotype_df, batch_size=batch_size)
191 |         start_time = time.time()
192 |         if return_sparse:
193 | 
194 |             nps = phenotypes_t.shape[0]
195 |             i0_t = interaction_t - interaction_t.mean()
196 |             p0_t = phenotypes_t - phenotypes_t.mean(1, keepdim=True)
197 |             p0_t = residualizer.transform(p0_t, center=False)
198 |             i0_t = residualizer.transform(i0_t, center=False)
199 | 
200 |             tstat_g_list = []
201 |             tstat_i_list = []
202 |             tstat_gi_list = []
203 |             af_list = []
204 |             ix0 = []
205 |             ix1 = []
206 |             for k, (genotypes, variant_ids) in enumerate(ggt.generate_data(verbose=verbose), 1):
207 |                 genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
208 |                 genotypes_t, mask_t = filter_maf_interaction(genotypes_t[:, genotype_ix_t],
209 |                                                              interaction_mask_t=interaction_mask_t,
210 |                                                              maf_threshold_interaction=maf_threshold)
211 |                 if genotypes_t.shape[0] > 0:
212 |                     ng, ns = genotypes_t.shape
213 | 
214 |                     # calculate allele frequency
215 |                     af_t = genotypes_t.sum(1) / (2*ns)
216 | 
217 |                     # centered inputs
218 |                     g0_t = genotypes_t - genotypes_t.mean(1, keepdim=True)
219 |                     gi_t = genotypes_t * interaction_t
220 |                     gi0_t = gi_t - gi_t.mean(1, keepdim=True)
221 |                     # residualize rows
222 |                     g0_t = residualizer.transform(g0_t, center=False)
223 |                     gi0_t = residualizer.transform(gi0_t, center=False)
224 | 
225 |                     # regression
226 |                     X_t = torch.stack([g0_t, i0_t.repeat(ng, 1), gi0_t], 2)  # ng x ns x 3
227 |                     Xinv = torch.matmul(torch.transpose(X_t, 1, 2), X_t).inverse()  # ng x 3 x 3
228 |                     b_t = torch.matmul(torch.matmul(Xinv, torch.transpose(X_t, 1, 2)), p0_t.t())  # ng x 3 x np
229 |                     dof = residualizer.dof - 2
230 | 
231 |                     rss_t = (torch.matmul(X_t, b_t) - p0_t.t()).pow(2).sum(1)  # ng x np
232 |                     b_se_t = torch.sqrt(Xinv[:, torch.eye(3, dtype=torch.uint8).bool()].unsqueeze(-1).repeat([1,1,nps]) * rss_t.unsqueeze(1).repeat([1,3,1]) / dof)
233 |                     tstat_t = (b_t.double() / b_se_t.double()).float()  # (ng x 3 x np)
234 |                     tstat_g_t = tstat_t[:,0,:]  # genotypes x phenotypes
235 |                     tstat_i_t = tstat_t[:,1,:]
236 |                     tstat_gi_t = tstat_t[:,2,:]
237 |                     m = tstat_gi_t.abs() >= tstat_threshold
238 |                     tstat_g_t = tstat_g_t[m]
239 |                     tstat_i_t = tstat_i_t[m]
240 |                     tstat_gi_t = tstat_gi_t[m]
241 |                     ix = m.nonzero(as_tuple=False)  # indexes: [genotype, phenotype]
242 |                     af_t = af_t[ix[:,0]]
243 | 
244 |                     res = [tstat_g_t, tstat_i_t, tstat_gi_t, af_t, ix]
245 |                     tstat_g, tstat_i, tstat_gi, af, ix = [i.cpu().numpy() for i in res]
246 |                     mask = mask_t.cpu().numpy()
247 |                     # convert sparse indexes
248 |                     if len(ix) > 0:
249 |                         variant_ids = variant_ids[mask.astype(bool)]
250 |                         tstat_g_list.append(tstat_g)
251 |                         tstat_i_list.append(tstat_i)
252 |                         tstat_gi_list.append(tstat_gi)
253 |                         af_list.append(af)
254 |                         ix0.extend(variant_ids[ix[:,0]].tolist())
255 |                         ix1.extend(phenotype_df.index[ix[:,1]].tolist())
256 | 
257 |             logger.write(f' time elapsed: {(time.time()-start_time)/60:.2f} min')
258 | 
259 |             # concatenate
260 |             pval_g = get_t_pval(np.concatenate(tstat_g_list), dof, log=logp)
261 |             pval_i = get_t_pval(np.concatenate(tstat_i_list), dof, log=logp)
262 |             pval_gi = get_t_pval(np.concatenate(tstat_gi_list), dof, log=logp)
263 |             af = np.concatenate(af_list)
264 | 
265 |             pval_df = pd.DataFrame(np.c_[ix0, ix1, pval_g, pval_i, pval_gi, af],
266 |                                    columns=['variant_id', 'phenotype_id', 'pval_g', 'pval_i', 'pval_gi', 'af']
267 |                                    ).astype({'pval_g':np.float64, 'pval_i':np.float64, 'pval_gi':np.float64, 'af':np.float32})
268 |             return pval_df
269 |         else:  # dense output
270 |             output_list = []
271 |             for k, (genotypes, variant_ids) in enumerate(ggt.generate_data(verbose=verbose), 1):
272 |                 genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
273 |                 genotypes_t, mask_t = filter_maf_interaction(genotypes_t[:, genotype_ix_t],
274 |                                                              interaction_mask_t=interaction_mask_t,
275 |                                                              maf_threshold_interaction=maf_threshold)
276 |                 res = calculate_interaction_nominal(genotypes_t, phenotypes_t, interaction_t.t(), residualizer,
277 |                                                     return_sparse=return_sparse)
278 |                 # res: tstat, b, b_se, af, ma_samples, ma_count
279 |                 res = [i.cpu().numpy() for i in res]
280 |                 mask = mask_t.cpu().numpy()
281 |                 variant_ids = variant_ids[mask.astype(bool)]
282 |                 output_list.append(res + [variant_ids])
283 |             logger.write(f' time elapsed: {(time.time()-start_time)/60:.2f} min')
284 | 
285 |             # concatenate outputs
286 |             tstat = np.concatenate([i[0] for i in output_list])
287 |             pval = get_t_pval(tstat, dof, log=logp)
288 |             b = np.concatenate([i[1] for i in output_list])
289 |             b_se = np.concatenate([i[2] for i in output_list])
290 |             af = np.concatenate([i[3] for i in output_list])
291 |             ma_samples = np.concatenate([i[4] for i in output_list])
292 |             ma_count = np.concatenate([i[5] for i in output_list])
293 |             variant_ids = np.concatenate([i[6] for i in output_list])
294 | 
295 |             pval_g_df = pd.DataFrame(pval[:,0,:], index=variant_ids, columns=phenotype_df.index)
296 |             pval_i_df = pd.DataFrame(pval[:,1,:], index=variant_ids, columns=phenotype_df.index)
297 |             pval_gi_df = pd.DataFrame(pval[:,2,:], index=variant_ids, columns=phenotype_df.index)
298 |             af_s = pd.Series(af, index=variant_ids, name='af').astype(np.float32)
299 |             ma_samples_s = pd.Series(ma_samples, index=variant_ids, name='ma_samples').astype(np.int32)
300 |             ma_count_s = pd.Series(ma_count, index=variant_ids, name='ma_count').astype(np.int32)
301 |             return pval_g_df, pval_i_df, pval_gi_df, af_s, ma_samples_s, ma_count_s
302 | 
303 | 
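# The interaction branch above solves, for each variant, the regression
# p ~ b_g*g + b_i*i + b_gi*(g*i) in closed form via the normal equations, batched
# over variants. A single-variant NumPy sketch (illustrative only; g, i, and p
# stand for already centered and residualized vectors, and dof for the residual
# degrees of freedom; in the code above, the g*i column is likewise centered and
# residualized before the fit):
#
#   X = np.stack([g, i, g*i], axis=1)          # ns x 3 design matrix
#   Xinv = np.linalg.inv(X.T @ X)              # 3 x 3
#   b = Xinv @ X.T @ p                         # [b_g, b_i, b_gi]
#   rss = ((X @ b - p)**2).sum()               # residual sum of squares
#   b_se = np.sqrt(np.diag(Xinv) * rss / dof)  # standard errors
#   tstat = b / b_se                           # -> pval_g, pval_i, pval_gi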
304 | def map_permutations(genotype_df, covariates_df, permutations=None,
305 |                      chr_s=None, nperms=10000, maf_threshold=0.05,
306 |                      batch_size=20000, logger=None, seed=None, verbose=True):
307 |     """Compute a null distribution of maximum genotype-phenotype correlations via permutations
308 | 
309 |     Permuted phenotypes are tested against all variants; the maximum r2 per permutation is recorded (per chromosome if chr_s is provided).
310 |     Warning: this function assumes that all phenotypes are normally distributed,
311 |     e.g., inverse normal transformed
312 |     """
313 | 
314 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
315 | 
316 |     if logger is None:
317 |         logger = SimpleLogger()
318 |     assert covariates_df.index.isin(genotype_df.columns).all()
319 |     sample_ids = covariates_df.index.values
320 | 
321 |     variant_ids = genotype_df.index.tolist()
322 | 
323 |     # index of VCF samples corresponding to phenotypes
324 |     genotype_ix = np.array([genotype_df.columns.tolist().index(i) for i in sample_ids])
325 |     genotype_ix_t = torch.from_numpy(genotype_ix).to(device)
326 | 
327 |     n_variants = len(variant_ids)
328 |     n_samples = len(sample_ids)
329 |     dof = n_samples - 2 - covariates_df.shape[1]
330 | 
331 |     logger.write('trans-QTL mapping (permutations)')
332 |     logger.write(f' * {n_samples} samples')
333 |     logger.write(f' * {covariates_df.shape[1]} covariates')
334 |     logger.write(f' * {n_variants} variants')
335 | 
336 |     if permutations is None:  # generate permutations assuming normal distribution
337 |         q = stats.norm.ppf(np.arange(1,n_samples+1)/(n_samples+1))
338 |         permutations = np.tile(q,[nperms,1])
339 |         if seed is not None:
340 |             np.random.seed(seed)
341 |         for i in np.arange(nperms):
342 |             np.random.shuffle(permutations[i,:])
343 |     else:
344 |         assert permutations.shape[1]==n_samples
345 |         nperms = permutations.shape[0]
346 |     logger.write(f' * {nperms} permutations')
347 | 
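    # A sketch of the permutation scheme above, with assumed sizes n_samples = 5
    # and nperms = 2:
    #
    #   q = stats.norm.ppf(np.arange(1, 6)/6)     # [-0.97, -0.43, 0.0, 0.43, 0.97]
    #   permutations = np.tile(q, [2, 1])         # 2 x 5, one row per permutation
    #   for i in np.arange(2):
    #       np.random.shuffle(permutations[i,:])  # shuffle each row in place
    #
    # Each row is a synthetic inverse-normal phenotype, which is why the docstring
    # warns that real phenotypes should be inverse normal transformed.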
348 |     permutations_t = torch.tensor(permutations, dtype=torch.float32).to(device)
349 |     residualizer = Residualizer(torch.tensor(covariates_df.values, dtype=torch.float32).to(device))
350 | 
351 |     if chr_s is not None:
352 |         assert chr_s.index.equals(genotype_df.index)
353 |         start_time = time.time()
354 |         n_variants = 0
355 |         ggt = genotypeio.GenotypeGeneratorTrans(genotype_df, batch_size=batch_size, chr_s=chr_s)
356 |         total_batches = np.sum([len(ggt.chr_batch_indexes[c]) for c in ggt.chroms])
357 | 
358 |         chr_max_r2 = OrderedDict()
359 |         k = 0
360 |         for chrom in ggt.chroms:
361 |             max_r2_t = torch.FloatTensor(nperms).fill_(0).to(device)
362 |             for k, (genotypes, variant_ids) in enumerate(ggt.generate_data(chrom=chrom, verbose=verbose, enum_start=k+1), k+1):
363 |                 genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
364 |                 genotypes_t = genotypes_t[:, genotype_ix_t]
365 |                 impute_mean(genotypes_t)
366 |                 genotypes_t, _, _ = filter_maf(genotypes_t, variant_ids, maf_threshold)
367 |                 n_variants += genotypes_t.shape[0]
368 | 
369 |                 r2_t = calculate_corr(genotypes_t, permutations_t, residualizer=residualizer).pow(2)
370 |                 del genotypes_t
371 |                 m,_ = r2_t.max(0)
372 |                 max_r2_t = torch.max(m, max_r2_t)
373 |             chr_max_r2[chrom] = max_r2_t.cpu()
374 |         logger.write(f' time elapsed: {(time.time()-start_time)/60:.2f} min')
375 |         if maf_threshold > 0:
376 |             logger.write(f' * {n_variants} variants passed MAF >= {maf_threshold} filtering')
377 |         chr_max_r2 = pd.DataFrame(chr_max_r2)
378 | 
379 |         # leave-one-out max
380 |         max_r2 = OrderedDict()
381 |         for c in chr_max_r2:
382 |             max_r2[c] = chr_max_r2[np.setdiff1d(chr_max_r2.columns, c)].max(1)
383 |         max_r2 = pd.DataFrame(max_r2)  # nperms x chrs
384 | 
385 |         # empirical p-values
386 |         tstat = np.sqrt(dof*max_r2 / (1-max_r2))
387 |         minp_empirical = pd.DataFrame(2*stats.t.cdf(-np.abs(tstat), dof), columns=tstat.columns)  # nperms x chrs
388 | 
389 |         beta_shape1 = OrderedDict()
390 |         beta_shape2 = OrderedDict()
391 |         true_dof = OrderedDict()
392 |         minp_vec = OrderedDict()
393 |         for c in max_r2:
394 |             beta_shape1[c], beta_shape2[c], true_dof[c], minp_vec[c] = fit_beta_parameters(max_r2[c], dof, return_minp=True)
395 | 
396 |         beta_df = pd.DataFrame(OrderedDict([
397 |             ('beta_shape1', beta_shape1),
398 |             ('beta_shape2', beta_shape2),
399 |             ('true_df', true_dof),
400 |             ('minp_true_df', minp_vec),
401 |             ('minp_empirical', {c:minp_empirical[c].values for c in minp_empirical}),
402 |         ]))
403 |         return beta_df
404 | 
405 |     else:  # not split_chr
406 |         ggt = genotypeio.GenotypeGeneratorTrans(genotype_df, batch_size=batch_size)
407 |         start_time = time.time()
408 |         max_r2_t = torch.FloatTensor(nperms).fill_(0).to(device)
409 |         n_variants = 0
410 |         for k, (genotypes, variant_ids) in enumerate(ggt.generate_data(verbose=verbose), 1):
411 |             genotypes_t = torch.tensor(genotypes, dtype=torch.float).to(device)
412 |             genotypes_t = genotypes_t[:, genotype_ix_t]
413 |             impute_mean(genotypes_t)
414 |             genotypes_t, _, _ = filter_maf(genotypes_t, variant_ids, maf_threshold)
415 |             n_variants += genotypes_t.shape[0]
416 | 
417 |             r2_t = calculate_corr(genotypes_t, permutations_t, residualizer=residualizer).pow(2)
418 |             del genotypes_t
419 |             m,_ = r2_t.max(0)
420 |             max_r2_t = torch.max(m, max_r2_t)
421 |         logger.write(f' time elapsed: {(time.time()-start_time)/60:.2f} min')
422 |         if maf_threshold > 0:
423 |             logger.write(f' * {n_variants} variants passed MAF >= {maf_threshold} filtering')
424 |         max_r2 = max_r2_t.cpu().numpy().astype(np.float64)
425 |         tstat = np.sqrt(dof*max_r2 / (1-max_r2))
426 |         minp_empirical = 2*stats.t.cdf(-np.abs(tstat), dof)
427 |         beta_shape1, beta_shape2, true_dof, minp_vec = fit_beta_parameters(max_r2, dof, tol=1e-4, return_minp=True)
428 | 
429 |         beta_s = pd.Series([n_samples, dof, beta_shape1, beta_shape2, true_dof, minp_vec, minp_empirical],
430 |                            index=['num_samples', 'df', 'beta_shape1', 'beta_shape2', 'true_df', 'minp_true_df', 'minp_empirical'])
431 |         return beta_s
432 | 
433 | 
434 | def apply_permutations(res, pairs_df):
435 |     """Add permutation-based significance estimates to map_trans() output
436 |     res: output from map_permutations()
437 |     pairs_df: output from map_trans()
438 |     """
439 | 
440 |     if isinstance(res, pd.Series):  # chrs not split
441 |         nperms = len(res['minp_true_df'])
442 |         for k in ['beta_shape1', 'beta_shape2', 'true_df']:
443 |             pairs_df[k] = res[k]
444 |         pairs_df['pval_true_df'] = pval_from_corr(pairs_df['r2'], pairs_df['true_df'])
445 |         pairs_df['pval_perm'] = np.array([(np.sum(res['minp_empirical']<=p)+1)/(nperms+1) for p in pairs_df['pval']])
446 |         pairs_df['pval_beta'] = stats.beta.cdf(pairs_df['pval_true_df'], pairs_df['beta_shape1'], pairs_df['beta_shape2'])
447 | 
448 |     elif isinstance(res, pd.DataFrame):  # chrs split
449 |         nperms = len(res['minp_empirical'][0])
450 |         for k in ['beta_shape1', 'beta_shape2', 'true_df']:
451 |             pairs_df[k] = res.loc[pairs_df['phenotype_chr'], k].values
452 |         pairs_df['pval_true_df'] = pval_from_corr(pairs_df['r2'], pairs_df['true_df'])
453 |         pairs_df['pval_perm'] = [(np.sum(pe<=p)+1)/(nperms+1) for p,pe in zip(pairs_df['pval'], res.loc[pairs_df['phenotype_chr'], 'minp_empirical'])]
454 |         # pval_perm = np.array([(np.sum(minp_empirical[chrom]<=p)+1)/(nperms+1) for p, chrom in zip(pval_df['pval'], pval_df['phenotype_chr'])])
455 |         # pval_perm = np.array([(np.sum(minp_empirical<=p)+1)/(nperms+1) for p in minp_nominal])
456 |         pairs_df['pval_beta'] = stats.beta.cdf(pairs_df['pval_true_df'], pairs_df['beta_shape1'], pairs_df['beta_shape2'])
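# The empirical p-value computed in apply_permutations() is the standard
# permutation estimate (r + 1)/(nperms + 1), where r counts permutation minima at
# least as extreme as the observed p-value. A toy example with assumed values:
#
#   minp_empirical = np.array([1e-6, 2e-4, 0.3, 0.8])        # nperms = 4
#   p = 1e-3                                                 # nominal p-value
#   pval_perm = (np.sum(minp_empirical <= p) + 1) / (4 + 1)  # (2+1)/5 = 0.6
#
# pval_beta instead evaluates the CDF of the fitted Beta(beta_shape1, beta_shape2)
# distribution at the nominal p-value recomputed with the effective ('true') dof.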
457 | 
--------------------------------------------------------------------------------